Repository: aiming-lab/AutoResearchClaw Branch: main Commit: 258dae2bb28f Files: 422 Total size: 4.1 MB Directory structure: gitextract_tp1xyq09/ ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── config.researchclaw.example.yaml ├── docs/ │ ├── BUG_FIX_DOCUMENT_20260316.md │ ├── BUG_TRACKER.md │ ├── CHANGELOG_ANTHROPIC_ADAPTER.md │ ├── PIPELINE_TEST_LOG_R5.md │ ├── README_AR.md │ ├── README_CN.md │ ├── README_DE.md │ ├── README_ES.md │ ├── README_FR.md │ ├── README_JA.md │ ├── README_KO.md │ ├── README_PT.md │ ├── README_RU.md │ ├── TESTER_GUIDE.md │ ├── TESTER_GUIDE_CN.md │ ├── TESTER_GUIDE_JA.md │ ├── agent_figure_and_benchmark_plan.md │ ├── figure_prompts/ │ │ ├── case_a_meta_learning.md │ │ └── case_b_rlhf_alignment.md │ ├── integration-guide.md │ ├── issue_tracker_v9.md │ ├── iteration_plan_v8.md │ ├── iteration_showcase_narrative.md │ ├── metaclaw-integration-plan.md │ ├── next_phase_showcase_plan.md │ ├── pipeline_critical_fixes_v8.md │ ├── rate_limit_fix_plan.md │ ├── sandbox_environment_fix_plan.md │ └── showcase/ │ └── SHOWCASE.md ├── prompts.default.yaml ├── pyproject.toml ├── researchclaw/ │ ├── __init__.py │ ├── __main__.py │ ├── adapters.py │ ├── agents/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── benchmark_agent/ │ │ │ ├── __init__.py │ │ │ ├── acquirer.py │ │ │ ├── orchestrator.py │ │ │ ├── selector.py │ │ │ ├── surveyor.py │ │ │ └── validator.py │ │ ├── code_searcher/ │ │ │ ├── __init__.py │ │ │ ├── agent.py │ │ │ ├── cache.py │ │ │ ├── github_client.py │ │ │ ├── pattern_extractor.py │ │ │ └── query_gen.py │ │ └── figure_agent/ │ │ ├── __init__.py │ │ ├── codegen.py │ │ ├── critic.py │ │ ├── decision.py │ │ ├── integrator.py │ │ ├── nano_banana.py │ │ ├── orchestrator.py │ │ ├── planner.py │ │ ├── renderer.py │ │ └── style_config.py │ ├── assessor/ │ │ ├── __init__.py │ │ ├── comparator.py │ │ ├── rubrics.py │ │ ├── scorer.py │ │ └── venue_recommender.py │ ├── calendar/ │ │ ├── __init__.py │ │ ├── data/ │ │ │ └── conferences.yaml │ │ 
├── deadlines.py │ │ ├── planner.py │ │ └── reminder.py │ ├── cli.py │ ├── collaboration/ │ │ ├── __init__.py │ │ ├── dedup.py │ │ ├── publisher.py │ │ ├── repository.py │ │ └── subscriber.py │ ├── config.py │ ├── copilot/ │ │ ├── __init__.py │ │ ├── branching.py │ │ ├── controller.py │ │ ├── feedback.py │ │ └── modes.py │ ├── dashboard/ │ │ ├── __init__.py │ │ ├── broadcaster.py │ │ ├── collector.py │ │ └── metrics.py │ ├── data/ │ │ ├── __init__.py │ │ ├── benchmark_knowledge.yaml │ │ ├── dataset_registry.yaml │ │ ├── docker_profiles.yaml │ │ ├── framework_docs/ │ │ │ ├── axolotl.md │ │ │ ├── llamafactory.md │ │ │ ├── peft.md │ │ │ ├── transformers_training.md │ │ │ └── trl.md │ │ └── seminal_papers.yaml │ ├── docker/ │ │ ├── Dockerfile │ │ ├── Dockerfile.biology │ │ ├── Dockerfile.chemistry │ │ ├── Dockerfile.economics │ │ ├── Dockerfile.generic │ │ ├── Dockerfile.math │ │ ├── Dockerfile.physics │ │ └── entrypoint.sh │ ├── domains/ │ │ ├── __init__.py │ │ ├── adapters/ │ │ │ ├── __init__.py │ │ │ ├── biology.py │ │ │ ├── chemistry.py │ │ │ ├── economics.py │ │ │ ├── generic.py │ │ │ ├── math.py │ │ │ ├── ml.py │ │ │ ├── neuroscience.py │ │ │ ├── physics.py │ │ │ ├── robotics.py │ │ │ └── security.py │ │ ├── detector.py │ │ ├── experiment_schema.py │ │ ├── profiles/ │ │ │ ├── _generic.yaml │ │ │ ├── biology_genomics.yaml │ │ │ ├── biology_protein.yaml │ │ │ ├── biology_singlecell.yaml │ │ │ ├── chemistry_molprop.yaml │ │ │ ├── chemistry_qm.yaml │ │ │ ├── economics_empirical.yaml │ │ │ ├── mathematics_numerical.yaml │ │ │ ├── mathematics_optimization.yaml │ │ │ ├── ml_compression.yaml │ │ │ ├── ml_generative.yaml │ │ │ ├── ml_generic.yaml │ │ │ ├── ml_graph.yaml │ │ │ ├── ml_nlp.yaml │ │ │ ├── ml_rl.yaml │ │ │ ├── ml_tabular.yaml │ │ │ ├── ml_vision.yaml │ │ │ ├── neuroscience_computational.yaml │ │ │ ├── neuroscience_imaging.yaml │ │ │ ├── physics_pde.yaml │ │ │ ├── physics_quantum.yaml │ │ │ ├── physics_simulation.yaml │ │ │ ├── robotics_control.yaml │ │ │ └── 
security_detection.yaml │ │ └── prompt_adapter.py │ ├── evolution.py │ ├── experiment/ │ │ ├── __init__.py │ │ ├── agentic_sandbox.py │ │ ├── code_agent.py │ │ ├── colab_sandbox.py │ │ ├── docker_sandbox.py │ │ ├── evaluators/ │ │ │ ├── __init__.py │ │ │ └── convergence.py │ │ ├── factory.py │ │ ├── git_manager.py │ │ ├── harness_template.py │ │ ├── metrics.py │ │ ├── runner.py │ │ ├── sandbox.py │ │ ├── ssh_sandbox.py │ │ ├── validator.py │ │ └── visualize.py │ ├── feedback/ │ │ └── FEEDBACK_ANALYSIS_PROMPT.md │ ├── hardware.py │ ├── health.py │ ├── knowledge/ │ │ ├── __init__.py │ │ ├── base.py │ │ └── graph/ │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── entities.py │ │ ├── query.py │ │ ├── relations.py │ │ └── visualizer.py │ ├── literature/ │ │ ├── __init__.py │ │ ├── arxiv_client.py │ │ ├── cache.py │ │ ├── models.py │ │ ├── novelty.py │ │ ├── openalex_client.py │ │ ├── search.py │ │ ├── semantic_scholar.py │ │ ├── trends.py │ │ └── verify.py │ ├── llm/ │ │ ├── __init__.py │ │ ├── acp_client.py │ │ ├── anthropic_adapter.py │ │ └── client.py │ ├── mcp/ │ │ ├── __init__.py │ │ ├── client.py │ │ ├── registry.py │ │ ├── server.py │ │ ├── tools.py │ │ └── transport.py │ ├── memory/ │ │ ├── __init__.py │ │ ├── decay.py │ │ ├── embeddings.py │ │ ├── experiment_memory.py │ │ ├── ideation_memory.py │ │ ├── retriever.py │ │ ├── store.py │ │ └── writing_memory.py │ ├── metaclaw_bridge/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── lesson_to_skill.py │ │ ├── prm_gate.py │ │ ├── session.py │ │ ├── skill_feedback.py │ │ └── stage_skill_map.py │ ├── overleaf/ │ │ ├── __init__.py │ │ ├── conflict.py │ │ ├── formatter.py │ │ ├── sync.py │ │ └── watcher.py │ ├── pipeline/ │ │ ├── __init__.py │ │ ├── _domain.py │ │ ├── _helpers.py │ │ ├── code_agent.py │ │ ├── contracts.py │ │ ├── executor.py │ │ ├── experiment_diagnosis.py │ │ ├── experiment_repair.py │ │ ├── opencode_bridge.py │ │ ├── paper_verifier.py │ │ ├── runner.py │ │ ├── stage_impls/ │ │ │ ├── __init__.py │ │ │ ├── 
_analysis.py │ │ │ ├── _code_generation.py │ │ │ ├── _execution.py │ │ │ ├── _experiment_design.py │ │ │ ├── _literature.py │ │ │ ├── _paper_writing.py │ │ │ ├── _review_publish.py │ │ │ ├── _synthesis.py │ │ │ └── _topic.py │ │ ├── stages.py │ │ └── verified_registry.py │ ├── project/ │ │ ├── __init__.py │ │ ├── idea_pool.py │ │ ├── manager.py │ │ ├── models.py │ │ └── scheduler.py │ ├── prompts.py │ ├── quality.py │ ├── report.py │ ├── server/ │ │ ├── __init__.py │ │ ├── app.py │ │ ├── dialog/ │ │ │ ├── __init__.py │ │ │ ├── intents.py │ │ │ ├── router.py │ │ │ └── session.py │ │ ├── middleware/ │ │ │ ├── __init__.py │ │ │ └── auth.py │ │ ├── routes/ │ │ │ ├── __init__.py │ │ │ ├── chat.py │ │ │ ├── pipeline.py │ │ │ ├── projects.py │ │ │ └── voice.py │ │ └── websocket/ │ │ ├── __init__.py │ │ ├── events.py │ │ └── manager.py │ ├── servers/ │ │ ├── __init__.py │ │ ├── cloud_executor.py │ │ ├── dispatcher.py │ │ ├── monitor.py │ │ ├── registry.py │ │ ├── slurm_executor.py │ │ └── ssh_executor.py │ ├── skills/ │ │ ├── __init__.py │ │ ├── builtin/ │ │ │ ├── __init__.py │ │ │ ├── domain/ │ │ │ │ ├── cv-classification/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── cv-detection/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── nlp-alignment/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── nlp-pretraining/ │ │ │ │ │ └── SKILL.md │ │ │ │ └── rl-policy-optimization/ │ │ │ │ └── SKILL.md │ │ │ ├── experiment/ │ │ │ │ ├── experimental-design/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── meta-analysis/ │ │ │ │ │ └── SKILL.md │ │ │ │ └── systematic-review/ │ │ │ │ └── SKILL.md │ │ │ └── tooling/ │ │ │ ├── data-loading/ │ │ │ │ └── SKILL.md │ │ │ ├── distributed-training/ │ │ │ │ └── SKILL.md │ │ │ ├── mixed-precision/ │ │ │ │ └── SKILL.md │ │ │ └── pytorch-training/ │ │ │ └── SKILL.md │ │ ├── loader.py │ │ ├── matcher.py │ │ ├── registry.py │ │ └── schema.py │ ├── templates/ │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── conference.py │ │ ├── converter.py │ │ ├── results_table_builder.py │ │ └── styles/ │ │ ├── iclr_2025/ 
│ │ │ ├── iclr2025_conference.bst │ │ │ └── iclr2025_conference.sty │ │ ├── iclr_2026/ │ │ │ ├── iclr2026_conference.bst │ │ │ └── iclr2026_conference.sty │ │ ├── icml_2025/ │ │ │ ├── icml2025.bst │ │ │ └── icml2025.sty │ │ ├── icml_2026/ │ │ │ ├── icml2026.bst │ │ │ └── icml2026.sty │ │ ├── neurips_2024/ │ │ │ └── neurips_2024.sty │ │ └── neurips_2025/ │ │ └── neurips_2025.sty │ ├── trends/ │ │ ├── __init__.py │ │ ├── auto_topic.py │ │ ├── daily_digest.py │ │ ├── feeds.py │ │ ├── opportunity_finder.py │ │ └── trend_analyzer.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── sanitize.py │ │ └── thinking_tags.py │ ├── voice/ │ │ ├── __init__.py │ │ ├── commands.py │ │ ├── synthesizer.py │ │ └── transcriber.py │ ├── web/ │ │ ├── __init__.py │ │ ├── _ssrf.py │ │ ├── agent.py │ │ ├── crawler.py │ │ ├── pdf_extractor.py │ │ ├── scholar.py │ │ └── search.py │ ├── wizard/ │ │ ├── __init__.py │ │ ├── quickstart.py │ │ ├── templates.py │ │ └── validator.py │ └── writing_guide.py ├── scripts/ │ ├── metaclaw_start.sh │ ├── plot_iteration_showcase.py │ ├── test_beast_mode_e2e.py │ ├── test_code_agent_live.py │ ├── test_code_agent_sandbox.py │ └── test_codegen_v2.py ├── sentinel.sh ├── tests/ │ ├── __init__.py │ ├── conftest.py │ ├── e2e_docker_sandbox.py │ ├── e2e_real_llm.py │ ├── test_anthropic.py │ ├── test_assessor.py │ ├── test_benchmark_agent.py │ ├── test_calendar.py │ ├── test_cli.py │ ├── test_code_agent.py │ ├── test_code_searcher.py │ ├── test_collaboration.py │ ├── test_compiler.py │ ├── test_convergence_evaluator.py │ ├── test_copilot.py │ ├── test_decision_agent.py │ ├── test_domain_detector.py │ ├── test_entry_point_validation.py │ ├── test_experiment_diagnosis.py │ ├── test_experiment_repair.py │ ├── test_experiment_schema.py │ ├── test_figure_agent.py │ ├── test_knowledge_graph.py │ ├── test_mcp.py │ ├── test_memory_system.py │ ├── test_metaclaw_bridge/ │ │ ├── __init__.py │ │ ├── test_config.py │ │ ├── test_lesson_to_skill.py │ │ ├── test_prm_gate.py │ │ ├── 
test_session.py │ │ ├── test_skill_feedback.py │ │ └── test_stage_skill_map.py │ ├── test_metric_parser.py │ ├── test_minimax_provider.py │ ├── test_neuroscience_domain.py │ ├── test_opencode_bridge.py │ ├── test_overleaf.py │ ├── test_paper_verifier.py │ ├── test_project_manager.py │ ├── test_prompt_adapter.py │ ├── test_rc_adapters.py │ ├── test_rc_cache.py │ ├── test_rc_checkpoint.py │ ├── test_rc_citation_resolve.py │ ├── test_rc_citation_verify.py │ ├── test_rc_cli.py │ ├── test_rc_config.py │ ├── test_rc_contracts.py │ ├── test_rc_docker_sandbox.py │ ├── test_rc_e2e_regression.py │ ├── test_rc_evolution.py │ ├── test_rc_executor.py │ ├── test_rc_hardware.py │ ├── test_rc_health.py │ ├── test_rc_kb.py │ ├── test_rc_literature.py │ ├── test_rc_llm.py │ ├── test_rc_novelty.py │ ├── test_rc_preflight.py │ ├── test_rc_prompts.py │ ├── test_rc_quality.py │ ├── test_rc_report.py │ ├── test_rc_runner.py │ ├── test_rc_sanitization.py │ ├── test_rc_sentinel.py │ ├── test_rc_stages.py │ ├── test_rc_templates.py │ ├── test_rc_validator.py │ ├── test_results_table_builder.py │ ├── test_robotics_adapter.py │ ├── test_servers.py │ ├── test_skills_library.py │ ├── test_ssh_and_colab_sandbox.py │ ├── test_trends.py │ ├── test_universal_codegen_integration.py │ ├── test_v6_improvements.py │ ├── test_verified_registry.py │ ├── test_web_crawler.py │ ├── test_web_integration.py │ ├── test_web_pdf_extractor.py │ ├── test_web_platform.py │ ├── test_web_scholar.py │ └── test_web_search.py └── website/ ├── features.html ├── getting-started.html ├── index.html ├── papers.html ├── pipeline.html └── style.css ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ HANDOFF_METACLAW_SKILL_LOOP.md .venv/ __pycache__/ *.pyc *.egg-info/ dist/ build/ workspaces/ .claude/* !.claude/agents/ !.claude/agents/*.md 
!.claude/skills/ !.claude/skills/**/SKILL.md .claude/settings.local.json # Experiment run artifacts (local only) artifacts/ output/ experiment_metaclaw/ promotional/ # Legacy experiment artifacts (pre-v5) exp/ logs/ writing/ # Root-level config (local overrides, not committed) /config.yaml # Sensitive / credentials user_token_cache.json *.secret .env .env.* config_run*.yaml # Literature search cache .researchclaw_cache/ # Playwright MCP logs .playwright-mcp/ # Internal dev/debug docs (not for public) docs/internal/ docs/kb/ docs/plans/ docs/BUGFIX_TRACKER*.md docs/IMPROVEMENT_PLAN*.md docs/IMPROVEMENT_*_EXECUTION.md docs/OPTIMIZATION_PLAN*.md docs/MULTI_CASE_EVALUATION*.md docs/pipeline_quality_issues*.md docs/autobench-loop.md RESEARCHCLAW_AGENTS.md RESEARCHCLAW_CLAUDE.md # Task-specific config files (keep example template only) config_agent_*.yaml config_case*.yaml config_v8_case*.yaml pipeline_run_*.log benchmarks/ # Logo generation prompts image/logo_prompt.md # macOS .DS_Store run.log # Misc temp files .history/ .serena/ cli_pause 暂停 进入 连续失败 重试一次 .venv_arc/ /config.arc.yaml config_*.yaml # Frontend (local dev only) frontend/ # Test outputs and run logs (local only) test_outputs*/ records/ run*_full_log.txt mdpdf.log scripts/md2pdf.py # Local docs (not for public) docs/tasks/ docs/feature_expansion_analysis.* docs/tester_guide_cn.* ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to AutoResearchClaw ## Setup 1. Fork and clone the repo 2. Create a venv and install with dev extras: ``` python3 -m venv .venv && source .venv/bin/activate pip install -e ".[dev]" ``` 3. Generate your local config: ``` researchclaw init ``` 4. 
Edit `config.arc.yaml` with your LLM settings ## Config Convention - `config.researchclaw.example.yaml` — tracked template (do not add secrets) - `config.arc.yaml` — your local config (gitignored, created by `researchclaw init`) - `config.yaml` — also gitignored, supported as fallback ## Running Tests ``` pytest tests/ ``` ## Checking Your Environment ``` researchclaw doctor ``` ## PR Guidelines - Branch from main - One concern per PR - Ensure `pytest tests/` passes - Include tests for new functionality ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2026 Aiming Lab Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================
Just chat with OpenClaw: "Research X" → done.
🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 Paper Showcase · 📖 Integration Guide · 💬 Discord Community
---|
|
🏆 Generated Paper Showcase 8 papers across 8 domains — math, statistics, biology, computing, NLP, RL, vision, robustness — generated fully autonomously with zero human intervention. |
| 📄 | paper_draft.md | Full academic paper (Introduction, Related Work, Method, Experiments, Results, Conclusion) |
| 📐 | paper.tex | Conference-ready LaTeX (NeurIPS / ICLR / ICML templates) |
| 📚 | references.bib | Real BibTeX references from OpenAlex, Semantic Scholar and arXiv — auto-pruned to match inline citations |
| 🔍 | verification_report.json | 4-layer citation integrity + relevance verification (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | Generated code + sandbox results + structured JSON metrics |
| 📊 | charts/ | Auto-generated condition comparison charts with error bars and confidence intervals |
| 📝 | reviews.md | Multi-agent peer review with methodology-evidence consistency checks |
| 🧬 | evolution/ | Self-learning lessons extracted from each run |
| 📦 | deliverables/ | All final outputs in one folder — compile-ready for Overleaf |
Built with 🦞 by the AutoResearchClaw team
================================================ FILE: config.researchclaw.example.yaml ================================================ project: name: "my-research" mode: "full-auto" research: topic: "Your research topic here" domains: - "machine-learning" daily_paper_count: 10 quality_threshold: 4.0 runtime: timezone: "America/New_York" max_parallel_tasks: 3 approval_timeout_hours: 12 retry_limit: 2 notifications: channel: "console" target: "" on_stage_start: true on_stage_fail: true on_gate_required: true knowledge_base: backend: "markdown" root: "docs/kb" openclaw_bridge: use_cron: false use_message: false use_memory: false use_sessions_spawn: false use_web_fetch: false use_browser: false llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" api_key: "" primary_model: "gpt-4o" fallback_models: - "gpt-4.1" - "gpt-4o-mini" # --- MiniMax provider example --- # provider: "minimax" # api_key_env: "MINIMAX_API_KEY" # primary_model: "MiniMax-M2.5" # fallback_models: # - "MiniMax-M2.5-highspeed" security: hitl_required_stages: [5, 9, 20] allow_publish_without_approval: false redact_sensitive_logs: true experiment: # ★ mode 决定实验结果的真实性 # "sandbox" — 在本地沙盒中实际执行生成的 Python 代码,产出真实实验数据 # "docker" — 在 Docker 容器中执行,支持 GPU 直通、依赖自动安装、内存隔离 # "simulated" — 不执行代码,使用公式生成假数据(仅用于框架开发调试,不应用于论文生成) mode: "sandbox" time_budget_sec: 300 max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" sandbox: # Use ".venv/Scripts/python.exe" on Windows python_path: ".venv/bin/python3" gpu_required: false max_memory_mb: 4096 # Docker sandbox settings (only used when mode: "docker") # Build image first: docker build -t researchclaw/experiment:latest researchclaw/docker/ docker: image: "researchclaw/experiment:latest" gpu_enabled: true # gpu_device_ids: [0] # empty = all GPUs memory_limit_mb: 8192 network_policy: "setup_only" # none | setup_only | pip_only | full # pip_pre_install: ["torchdiffeq", "einops"] auto_install_deps: true 
shm_size_mb: 2048 keep_containers: false ssh_remote: host: "" # SSH hostname or IP user: "" # SSH username (default: current user) port: 22 # SSH port key_path: "" # Path to private key (default: ~/.ssh/id_rsa) gpu_ids: [] # e.g. [0, 1] for CUDA_VISIBLE_DEVICES remote_workdir: "/tmp/researchclaw_experiments" remote_python: "python3" setup_commands: [] # e.g. ["source ~/venv/bin/activate", "pip install torch"] # Docker-over-SSH (most secure remote execution) use_docker: false # Set true to run experiments inside Docker on remote host docker_image: "researchclaw/experiment:latest" docker_network_policy: "none" # none | full docker_memory_limit_mb: 8192 docker_shm_size_mb: 2048 # OpenCode Beast Mode — external AI coding agent for complex experiments # Install: npm i -g opencode-ai@latest (or use `researchclaw setup`) opencode: enabled: true # Master switch (default: true) auto: true # Auto-trigger without confirmation (default: true) complexity_threshold: 0.2 # 0.0-1.0 — higher = only trigger on complex experiments model: "" # Override model (empty = use llm.primary_model) timeout_sec: 600 # Max seconds for OpenCode generation max_retries: 1 # Retry count on failure workspace_cleanup: true # Remove temp workspace after collection # ============================================================================ # SSH Remote Examples # ============================================================================ # # 1. Lab server (bare Python, basic sandboxing): # experiment: # mode: "ssh_remote" # ssh_remote: # host: "gpu-server.lab.edu" # user: "researcher" # key_path: "~/.ssh/id_rsa" # gpu_ids: [0] # remote_python: "python3" # # 2. Lab server (Docker — most secure): # experiment: # mode: "ssh_remote" # ssh_remote: # host: "gpu-server.lab.edu" # user: "researcher" # key_path: "~/.ssh/id_rsa" # gpu_ids: [0] # use_docker: true # docker_image: "researchclaw/experiment:latest" # docker_network_policy: "none" # # 3. 
Colab via SSH tunnel: # experiment: # mode: "ssh_remote" # ssh_remote: # host: "localhost" # port: 12345 # user: "root" # remote_python: "python3" # setup_commands: # - "pip install torch torchvision -q" # # 4. Colab via Google Drive (most robust, no SSH needed): # experiment: # mode: "colab_drive" # colab_drive: # drive_root: "~/Library/CloudStorage/GoogleDrive-you@gmail.com/My Drive/researchclaw" # poll_interval_sec: 30 # timeout_sec: 3600 # setup_script: "pip install torch torchvision -q" # # Then in Colab: run the colab_worker.py that appears in your Drive colab_drive: drive_root: "" # Local path to Google Drive mount poll_interval_sec: 30 # How often to check for results timeout_sec: 3600 # Max wait per experiment (1 hour) setup_script: "" # Shell commands to run before each experiment # Scientific Visualization Agent (Code-to-Viz + Nano Banana) # Uses a Decision Agent to analyze paper content and determine: # - Code figures (bar charts, line plots) → Matplotlib/TikZ # - Image figures (architecture, flowcharts) → Gemini Nano Banana figure_agent: enabled: true min_figures: 3 max_figures: 10 max_iterations: 3 render_timeout_sec: 30 # Security: Docker sandbox for visualization code execution # use_docker: null # null = auto-detect, true = force, false = disable docker_image: "researchclaw/experiment:latest" # Output format: "python" (Matplotlib/Seaborn) or "latex" (TikZ/PGFPlots) output_format: "python" # Nano Banana (Gemini native image generation) nano_banana_enabled: true # gemini_api_key: "" # or set GEMINI_API_KEY env var gemini_model: "gemini-2.5-flash-image" strict_mode: false dpi: 300 # === Prompts === # Customize LLM prompts by pointing to your own YAML file. # Copy prompts.default.yaml, edit the prompts you want, and set the path here. prompts: custom_file: "" # e.g. 
"my_prompts.yaml" (empty = use built-in defaults) # === MetaClaw Integration === # Enable the MetaClaw bridge to get skill injection, PRM quality gates, # and continuous learning from research pipeline failures. # Requires MetaClaw to be running: metaclaw start --mode skills_only metaclaw_bridge: enabled: false proxy_url: "http://localhost:30000" # MetaClaw proxy endpoint skills_dir: "~/.metaclaw/skills" # MetaClaw skills directory fallback_url: "" # Direct LLM URL if proxy is down fallback_api_key: "" # PRM quality gate: LLM-as-judge scoring at gate stages prm: enabled: false api_base: "" # OpenAI-compatible API for PRM judge api_key_env: "PRM_API_KEY" api_key: "" model: "gpt-5.4" votes: 3 # Majority vote count gate_stages: [5, 9, 15, 20] # Stages to apply PRM gating # Lesson-to-skill: auto-convert pipeline failures into MetaClaw skills lesson_to_skill: enabled: true min_severity: "error" # Only convert error-level lessons max_skills_per_run: 3 ================================================ FILE: docs/BUG_FIX_DOCUMENT_20260316.md ================================================ # Bug Fix Document — AutoResearchClaw Pipeline > 生成日期:2026-03-16 > 反馈来源:2 位测试者(user1: CV 方向 / GPU 环境, user2: Windows 环境) > 总计问题:9 个 ## 📊 总览 | 分类 | 数量 | |------|------| | 🔴 确认的 Bug(需修复) | **4** | | 🟠 架构改进(强烈建议) | **2** | | 🔵 功能需求 | **3** | ## 🔥 修复优先级 | 优先级 | ID | 问题 | 阶段 | 涉及文件 | |--------|----|------|------|----------| | 🔴 CRITICAL | BUG-001 | 论文硬件信息与实际不一致 | PAPER_DRAFT (17) | `executor.py`, `prompts.py` | | 🔴 CRITICAL | BUG-002 | Windows 环境 Docker 不可用导致实验链式失败 | EXPERIMENT_RUN (12) | `factory.py`, `docker_sandbox.py` | | 🔴 HIGH | BUG-003 | 论文内容自相矛盾(承诺评测但未执行) | PAPER_DRAFT (17), PEER_REVIEW (18) | `executor.py`, `prompts.py` | | 🔴 HIGH | BUG-004 | 生成代码缺少数值稳定性防护(NaN/Inf) | CODE_GENERATION (10) | `code_agent.py`, `prompts.py` | | 🟠 HIGH | ARCH-001 | Stage 17 过于严格的 hard block 策略 | PAPER_DRAFT (17) | `executor.py` | | 🟠 HIGH | ARCH-002 | Idea 降级时不询问用户确认 | EXPERIMENT_DESIGN (9), 
RESEARCH_DECISION (15) | `executor.py`, `stages.py` | --- ## 确认的 Bug — 详细修复方案 ### 🔴 `BUG-001` — 论文硬件信息与实际机器不一致 | 字段 | 内容 | |------|------| | **严重程度** | CRITICAL | | **所属阶段** | PAPER_DRAFT (Stage 17) | | **报告者** | user1 | **问题描述:** 论文中声称使用 A100 GPU 训练,但测试者实际机器上是 A5000。Pipeline 在 Stage 1 检测了硬件并保存到 `hardware_profile.json`,但在论文生成阶段完全没有利用这个信息来约束 LLM 输出。 **根因分析:** - `executor.py` 第 1226-1233 行:Stage 1 (TOPIC_INIT) 检测硬件,保存 `hardware_profile.json`,包含 `gpu_name`、`vram_gb` 等 - `executor.py` 第 2352-2391 行:硬件信息 **仅** 用于 CODE_GENERATION 阶段的代码生成 hints - `executor.py` 第 5776-5848 行:PAPER_DRAFT 阶段构建 prompt 时,**没有注入硬件 profile 信息** - LLM 在缺少约束的情况下会「幻觉」出常见的高端硬件名称(如 A100) **涉及文件:** - `researchclaw/pipeline/executor.py`(PAPER_DRAFT 阶段的 prompt 构建部分,约第 5776-5960 行) - `researchclaw/prompts.py`(paper writing prompt 模板) **修复方案:** 1. 在 PAPER_DRAFT 阶段的 prompt 构建中,读取 `stage-01/hardware_profile.json` 2. 将实际硬件信息(GPU 型号、VRAM、CPU 等)作为 **硬性约束** 注入 prompt,例如: ``` HARDWARE CONSTRAINT: The experiments were run on the following hardware: - GPU: {gpu_name} ({vram_gb} GB VRAM) - CPU: {cpu_info} You MUST use this exact hardware specification in the paper. Do NOT substitute with other GPU models. ``` 3. 在 PEER_REVIEW (Stage 18) 的 prompt 中增加一条审核规则:验证 paper 中提到的硬件是否与 `hardware_profile.json` 一致 **修复后预期行为:** 论文中的硬件描述必须与实际运行环境一致。
تحدث مع OpenClaw: «ابحث عن X» → تمّ.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 معرض الأوراق · 📖 دليل التكامل · 💬 مجتمع Discord
---|
|
🏆 معرض الأوراق المُولّدة 8 أوراق في 8 مجالات — الرياضيات، الإحصاء، الأحياء، الحوسبة، NLP، RL، الرؤية الحاسوبية، المتانة — مُولّدة بشكل مستقل تماماً بدون تدخل بشري. |
| 📄 | paper_draft.md | ورقة أكاديمية كاملة (مقدمة، أعمال سابقة، المنهجية، التجارب، النتائج، الخاتمة) |
| 📐 | paper.tex | LaTeX جاهز للمؤتمرات (قوالب NeurIPS / ICLR / ICML) |
| 📚 | references.bib | مراجع BibTeX حقيقية من OpenAlex و Semantic Scholar و arXiv — مُنقّحة تلقائياً لمطابقة الاستشهادات المضمّنة |
| 🔍 | verification_report.json | تحقق من سلامة الاستشهادات على 4 طبقات + التحقق من الصلة (arXiv، CrossRef، DataCite، LLM) |
| 🧪 | experiment runs/ | كود مُولّد + نتائج البيئة المعزولة + مقاييس JSON منظمة |
| 📊 | charts/ | رسوم بيانية مُولّدة تلقائياً لمقارنة الظروف مع أشرطة الخطأ وفترات الثقة |
| 📝 | reviews.md | مراجعة أقران متعددة الوكلاء مع فحص اتساق المنهجية والأدلة |
| 🧬 | evolution/ | دروس تعلّم ذاتي مستخلصة من كل تشغيل |
| 📦 | deliverables/ | جميع المخرجات النهائية في مجلد واحد — جاهزة للترجمة على Overleaf |
بُني بـ 🦞 بواسطة فريق AutoResearchClaw
================================================ FILE: docs/README_CN.md ================================================
直接与 OpenClaw 对话:"研究 X" → 搞定。
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 论文展示 · 📖 集成指南 · 💬 Discord 社区
---|
|
🏆 生成论文展示 8 篇论文覆盖 8 个领域 — 数学、统计、生物、计算、NLP、RL、视觉、鲁棒性 — 完全自主生成,零人工干预。 |
| 📄 | paper_draft.md | 完整学术论文(引言、相关工作、方法、实验、结果、结论) |
| 📐 | paper.tex | 适配顶会模板的 LaTeX 文件(NeurIPS / ICLR / ICML) |
| 📚 | references.bib | 来自 OpenAlex、Semantic Scholar 和 arXiv 的真实 BibTeX 引用——自动精简至与正文引用一致 |
| 🔍 | verification_report.json | 四层引用完整性 + 相关性核查(arXiv、CrossRef、DataCite、LLM) |
| 🧪 | experiment runs/ | 生成的代码 + 沙箱结果 + 结构化 JSON 指标 |
| 📊 | charts/ | 自动生成的条件对比图(含误差线和置信区间) |
| 📝 | reviews.md | 多 Agent 同行评审(含方法论-证据一致性检查) |
| 🧬 | evolution/ | 从每次运行中提取的自学习教训 |
| 📦 | deliverables/ | 所有最终产出集中在一个文件夹——可直接上传 Overleaf 编译 |
Built with 🦞 by the AutoResearchClaw team
================================================ FILE: docs/README_DE.md ================================================
Einfach mit OpenClaw chatten: "Research X" → erledigt.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 Paper-Showcase · 📖 Integrationsanleitung · 💬 Discord-Community
---|
|
🏆 Showcase generierter Paper 8 Paper aus 8 Disziplinen — Mathematik, Statistik, Biologie, Informatik, NLP, RL, Vision, Robustheit — vollständig autonom generiert ohne menschliches Eingreifen. |
| 📄 | paper_draft.md | Vollständiges wissenschaftliches Paper (Einleitung, Verwandte Arbeiten, Methode, Experimente, Ergebnisse, Fazit) |
| 📐 | paper.tex | Konferenzfertiges LaTeX (NeurIPS / ICLR / ICML Templates) |
| 📚 | references.bib | Echte BibTeX-Referenzen von OpenAlex, Semantic Scholar und arXiv — automatisch bereinigt, um Inline-Zitationen zu entsprechen |
| 🔍 | verification_report.json | 4-Schicht-Zitationsintegritäts- und Relevanzprüfung (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | Generierter Code + Sandbox-Ergebnisse + strukturierte JSON-Metriken |
| 📊 | charts/ | Automatisch generierte Vergleichsdiagramme mit Fehlerbalken und Konfidenzintervallen |
| 📝 | reviews.md | Multi-Agenten-Peer-Review mit Methodik-Evidenz-Konsistenzprüfungen |
| 🧬 | evolution/ | Selbstlernende Erkenntnisse aus jedem Durchlauf |
| 📦 | deliverables/ | Alle finalen Ergebnisse in einem Ordner — kompilierbereit für Overleaf |
Gebaut mit 🦞 vom AutoResearchClaw-Team
================================================ FILE: docs/README_ES.md ================================================
Chatea con OpenClaw: "Investiga X" → hecho.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 Galería de artículos · 📖 Guía de integración · 💬 Comunidad Discord
---| 📄 | paper_draft.md | Artículo académico completo (Introducción, Trabajo relacionado, Método, Experimentos, Resultados, Conclusión) |
| 📐 | paper.tex | LaTeX listo para conferencia (plantillas NeurIPS / ICLR / ICML) |
| 📚 | references.bib | Referencias BibTeX reales de OpenAlex, Semantic Scholar y arXiv — auto-depuradas para coincidir con las citas en línea |
| 🔍 | verification_report.json | Verificación de integridad + relevancia de citas en 4 capas (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | Código generado + resultados en sandbox + métricas JSON estructuradas |
| 📊 | charts/ | Gráficos de comparación de condiciones auto-generados con barras de error e intervalos de confianza |
| 📝 | reviews.md | Revisión por pares multi-agente con verificación de consistencia metodología-evidencia |
| 🧬 | evolution/ | Lecciones de auto-aprendizaje extraídas de cada ejecución |
| 📦 | deliverables/ | Todos los entregables finales en una sola carpeta — listos para compilar en Overleaf |
Construido con 🦞 por el equipo de AutoResearchClaw
================================================ FILE: docs/README_FR.md ================================================
Discutez avec OpenClaw : "Recherche X" → terminé.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 Vitrine des articles · 📖 Guide d'intégration · 💬 Communauté Discord
---| 📄 | paper_draft.md | Article académique complet (Introduction, Travaux connexes, Méthode, Expériences, Résultats, Conclusion) |
| 📐 | paper.tex | LaTeX prêt pour les conférences (templates NeurIPS / ICLR / ICML) |
| 📚 | references.bib | Références BibTeX réelles provenant d'OpenAlex, Semantic Scholar et arXiv — auto-élaguées pour correspondre aux citations dans le texte |
| 🔍 | verification_report.json | Vérification d'intégrité et de pertinence des citations sur 4 couches (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | Code généré + résultats sandbox + métriques JSON structurées |
| 📊 | charts/ | Graphiques de comparaison de conditions auto-générés avec barres d'erreur et intervalles de confiance |
| 📝 | reviews.md | Relecture multi-agents avec verification de coherence methodologie-preuves |
| 🧬 | evolution/ | Lecons d'auto-apprentissage extraites de chaque execution |
| 📦 | deliverables/ | Tous les livrables finaux dans un seul dossier — pret a compiler pour Overleaf |
Construit avec 🦞 par l'equipe AutoResearchClaw
================================================ FILE: docs/README_JA.md ================================================
OpenClaw にチャットするだけ:「Xを研究して」→ 完了。
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 論文ショーケース · 📖 統合ガイド · 💬 Discordコミュニティ
---
🏆 生成論文ショーケース 8つの分野にわたる8本の論文 — 数学、統計、生物学、コンピューティング、NLP、RL、ビジョン、ロバスト性 — 人間の介入なしに完全自律生成。 |
| 📄 | paper_draft.md | 完全な学術論文(序論、関連研究、手法、実験、結果、結論) |
| 📐 | paper.tex | 学会対応LaTeX(NeurIPS / ICLR / ICMLテンプレート) |
| 📚 | references.bib | OpenAlex、Semantic Scholar、arXivからの実際のBibTeX参考文献 — 本文中の引用に合わせて自動整理 |
| 🔍 | verification_report.json | 4層の引用整合性 + 関連性検証(arXiv、CrossRef、DataCite、LLM) |
| 🧪 | experiment runs/ | 生成されたコード + サンドボックス実行結果 + 構造化JSONメトリクス |
| 📊 | charts/ | 誤差棒と信頼区間付きの条件比較チャートを自動生成 |
| 📝 | reviews.md | 手法-証拠の一貫性チェック付きマルチエージェント査読 |
| 🧬 | evolution/ | 各実行から抽出された自己学習の教訓 |
| 📦 | deliverables/ | すべての最終成果物を1フォルダに集約 — Overleafですぐにコンパイル可能 |
Built with 🦞 by the AutoResearchClaw team
================================================ FILE: docs/README_KO.md ================================================
OpenClaw에 채팅하세요: "X 연구해줘" → 완료.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 논문 쇼케이스 · 📖 통합 가이드 · 💬 Discord 커뮤니티
---
🏆 생성된 논문 쇼케이스 8개 분야에 걸친 8편의 논문 — 수학, 통계, 생물학, 컴퓨팅, NLP, RL, 비전, 견고성 — 인간 개입 없이 완전 자율 생성. |
| 📄 | paper_draft.md | 완성된 학술 논문 (서론, 관련 연구, 방법론, 실험, 결과, 결론) |
| 📐 | paper.tex | 학회 제출용 LaTeX (NeurIPS / ICLR / ICML 템플릿) |
| 📚 | references.bib | OpenAlex, Semantic Scholar, arXiv에서 가져온 실제 BibTeX 참고문헌 — 인라인 인용과 일치하도록 자동 정리 |
| 🔍 | verification_report.json | 4계층 인용 무결성 + 관련성 검증 (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | 생성된 코드 + 샌드박스 결과 + 구조화된 JSON 메트릭 |
| 📊 | charts/ | 오차 막대와 신뢰 구간이 포함된 자동 생성 조건 비교 차트 |
| 📝 | reviews.md | 방법론-증거 일관성 검사를 포함한 멀티 에이전트 피어 리뷰 |
| 🧬 | evolution/ | 각 실행에서 추출된 자기 학습 교훈 |
| 📦 | deliverables/ | 모든 최종 산출물을 하나의 폴더에 — Overleaf에 바로 컴파일 가능 |
Built with 🦞 by the AutoResearchClaw team
================================================ FILE: docs/README_PT.md ================================================
Converse com o OpenClaw: "Pesquise X" → pronto.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 Galeria de Artigos · 📖 Guia de Integração · 💬 Comunidade Discord
---
🏆 Galeria de Artigos Gerados 8 artigos em 8 domínios — matemática, estatística, biologia, computação, NLP, RL, visão, robustez — gerados de forma totalmente autônoma sem intervenção humana. |
| 📄 | paper_draft.md | Artigo acadêmico completo (Introdução, Trabalhos Relacionados, Método, Experimentos, Resultados, Conclusão) |
| 📐 | paper.tex | LaTeX pronto para conferência (templates NeurIPS / ICLR / ICML) |
| 📚 | references.bib | Referências BibTeX reais do OpenAlex, Semantic Scholar e arXiv — auto-podadas para corresponder às citações inline |
| 🔍 | verification_report.json | Verificação de integridade + relevância de citações em 4 camadas (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | Código gerado + resultados do sandbox + métricas JSON estruturadas |
| 📊 | charts/ | Gráficos de comparação de condições gerados automaticamente com barras de erro e intervalos de confiança |
| 📝 | reviews.md | Revisão por pares multi-agente com verificações de consistência metodologia-evidência |
| 🧬 | evolution/ | Lições de autoaprendizagem extraídas de cada execução |
| 📦 | deliverables/ | Todas as saídas finais em uma pasta — pronto para compilar no Overleaf |
Construído com 🦞 pela equipe AutoResearchClaw
================================================ FILE: docs/README_RU.md ================================================
Просто напишите OpenClaw: «Исследуй X» → готово.
🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية
🏆 Галерея статей · 📖 Руководство по интеграции · 💬 Сообщество в Discord
---
| 📄 | paper_draft.md | Полная академическая статья (Введение, Обзор литературы, Метод, Эксперименты, Результаты, Заключение) |
| 📐 | paper.tex | Готовый LaTeX-код (шаблоны NeurIPS / ICLR / ICML) |
| 📚 | references.bib | Реальные BibTeX-ссылки из OpenAlex, Semantic Scholar и arXiv — автоматически отфильтрованные под цитаты в тексте |
| 🔍 | verification_report.json | 4-уровневая проверка целостности и релевантности цитирования (arXiv, CrossRef, DataCite, LLM) |
| 🧪 | experiment runs/ | Сгенерированный код + результаты из песочницы + структурированные JSON-метрики |
| 📊 | charts/ | Автоматически сгенерированные графики сравнения с планками погрешностей и доверительными интервалами |
| 📝 | reviews.md | Мультиагентное рецензирование с проверкой согласованности методологии и результатов |
| 🧬 | evolution/ | Уроки для самообучения, извлеченные из каждого запуска |
| 📦 | deliverables/ | Все итоговые материалы в одной папке — готовы к загрузке в Overleaf |
Создано с 🦞 командой AutoResearchClaw
================================================ FILE: docs/TESTER_GUIDE.md ================================================
Help us stress-test the world's first fully autonomous research pipeline — across every domain.
⭐ Star the Repo · 🚀 Quick Start · 📋 Feedback Template · 🇨🇳 中文测试指南 · 🇯🇵 日本語テストガイド
--- ## 👋 Welcome, Tester! **AutoResearchClaw** is a fully autonomous academic paper generation pipeline. You give it a research idea — it handles everything else: literature search, experiment design, code generation, experiment execution, paper writing, peer review, and final delivery. **23 stages, zero human intervention.** We're looking for testers from **all disciplines and backgrounds** — machine learning, NLP, computer vision, reinforcement learning, bioinformatics, physics, social sciences, and beyond. The more diverse the testing, the better the pipeline becomes. **Your mission:** Run the pipeline with your own research idea, inspect the output, and submit a detailed feedback report. That's it. Every piece of feedback directly shapes the next version. --- ## 📋 Table of Contents 1. [Prerequisites](#-prerequisites) 2. [Installation & Setup](#-installation--setup) 3. [Running the Pipeline](#-running-the-pipeline) 4. [Inspecting the Output](#-inspecting-the-output) 5. [Feedback Report Requirements](#-feedback-report-requirements) 6. [Feedback Template](#-feedback-template) 7. [FAQ](#-faq) --- ## 📦 Prerequisites | Item | Minimum | Recommended | |------|---------|-------------| | OS | macOS / Linux / WSL2 | Linux (Ubuntu 22.04+) | | Python | 3.11+ | 3.11 or 3.12 | | Disk | 500 MB | 2 GB+ | | RAM | 8 GB | 16 GB+ | | GPU | Not required (sandbox mode) | NVIDIA GPU + CUDA 12.x (docker mode) | | Network | Required (LLM API + literature search) | Stable connection | | LLM API Key | **Required** | OpenAI or Anthropic | ### 🔑 About API Keys The pipeline calls a large language model (LLM) at every stage — writing, coding, reviewing, and more. You'll need an API key from **OpenAI** or **Anthropic**. 
> **We strongly recommend using the most capable models available for the best results:** > > | Provider | Recommended Model | Fallback | > |----------|------------------|----------| > | **OpenAI** | **GPT-5.4** (best) | GPT-5.1 or GPT-4.1 | > | **Anthropic** | **Claude Opus 4.6** (best) | Claude Sonnet 4.6 | > > Using a top-tier model significantly improves paper quality, code correctness, and experiment design. Older models (e.g., GPT-4o) may produce noticeably weaker output. --- ## 🛠 Installation & Setup ### ⚠️ Always Use the Latest Version > **This project is under active development.** The codebase is updated frequently, and different versions can produce significantly different results. > > **Before every test run, always pull the latest code:** > > ```bash > cd AutoResearchClaw > git pull origin main > pip install -e . # Re-install to pick up changes > ``` > > Record your version for the feedback report: > ```bash > git log --oneline -1 > ``` --- ### Option A: Claude Code (Fastest — Recommended ⚡) If you have [Claude Code](https://claude.ai/claude-code) (Anthropic's CLI tool), just paste this: ``` Please clone and install AutoResearchClaw: https://github.com/aiming-lab/AutoResearchClaw.git If already cloned, run git pull origin main to update to the latest version first. Then create a config file with: - LLM: OpenAI with gpt-5.4 (or Anthropic Claude Opus 4.6) - Experiment mode: sandbox (local execution) - Research topic: "⭐ If you find this project interesting, please give us a star on GitHub!
================================================ FILE: docs/TESTER_GUIDE_CN.md ================================================
欢迎来自各个领域的你,一起测试全球首个全自动学术论文生成 Pipeline。
⭐ Star 项目 · 🚀 快速开始 · 📋 反馈模板 · 🇬🇧 English · 🇯🇵 日本語テストガイド
--- ## 👋 你好,测试者! **AutoResearchClaw** 是一个全自动学术论文生成 Pipeline。你只需提供一个研究 idea,系统就会自动完成文献检索、实验设计、代码生成、实验执行、论文撰写、同行评审到最终交付的全部 **23 个阶段**——无需任何人工干预。 我们正在寻找来自**各个学科和领域**的测试者——机器学习、NLP、计算机视觉、强化学习、生物信息学、物理学、社会科学……领域越多样,Pipeline 就能变得越好。 **你的任务:** 用你自己的研究 idea 运行一次完整的 Pipeline,检查输出质量,然后向我们提交一份详细的反馈报告。就这么简单——你的每一条反馈都会直接推动下一个版本的改进。 --- ## 📋 目录 1. [环境要求](#-环境要求) 2. [安装与配置](#-安装与配置) 3. [运行测试](#-运行测试) 4. [查看交付结果](#-查看交付结果) 5. [反馈报告要求](#-反馈报告要求) 6. [反馈报告模板](#-反馈报告模板) 7. [常见问题](#-常见问题) --- ## 📦 环境要求 | 项目 | 最低要求 | 推荐配置 | |------|---------|---------| | 操作系统 | macOS / Linux / WSL2 | Linux (Ubuntu 22.04+) | | Python | 3.11+ | 3.11 或 3.12 | | 磁盘空间 | 500 MB | 2 GB+ | | 内存 | 8 GB | 16 GB+ | | GPU | 非必须(sandbox 模式) | NVIDIA GPU + CUDA 12.x(docker 模式) | | 网络 | 需要(调用 LLM API + 文献检索) | 稳定的网络连接 | | LLM API Key | **必须** | OpenAI 或 Anthropic | ### 🔑 关于 API Key Pipeline 在每个阶段都会调用大语言模型(LLM)来完成写作、编码、评审等任务。你需要准备一个 **OpenAI** 或 **Anthropic** 的 API Key。 > **强烈建议使用最新、最强的模型以获得最佳效果:** > > | 提供商 | 推荐模型 | 备选 | > |--------|---------|------| > | **OpenAI** | **GPT-5.4**(首选) | GPT-5.1 或 GPT-4.1 | > | **Anthropic** | **Claude Opus 4.6**(首选) | Claude Sonnet 4.6 | > > 使用顶级模型会显著提升论文写作质量、代码生成准确性和实验设计合理性。较低版本的模型(如 gpt-4o)可能导致输出质量明显下降。 --- ## 🛠 安装与配置 ### ⚠️ 请务必使用最新版本 > **本项目处于快速迭代阶段,** 代码更新频繁,不同版本之间的生成效果可能存在较大差异。 > > **每次测试前,请务必拉取最新代码:** > > ```bash > cd AutoResearchClaw > git pull origin main > pip install -e . # 重新安装以确保更新生效 > ``` > > 记录你的版本号,方便填写反馈报告: > ```bash > git log --oneline -1 > ``` --- ### 方式 A:使用 Claude Code(最快 ⚡ 推荐) 如果你正在使用 [Claude Code](https://claude.ai/claude-code)(Anthropic 的 CLI 工具),直接粘贴以下内容即可: ``` 请帮我克隆并安装 AutoResearchClaw 项目: https://github.com/aiming-lab/AutoResearchClaw.git 如果已经克隆过,请先 git pull origin main 更新到最新版本。 安装完成后,帮我创建一个配置文件,使用以下参数: - LLM: OpenAI,模型选择 gpt-5.4(或 Anthropic Claude Opus 4.6) - 实验模式: sandbox(本地沙盒执行) - 研究主题: "<在这里填入你的研究 idea>" - 自动审批所有 gate stage 我的 API Key 是: sk-xxxx(请设为环境变量,不要写在配置文件里) ``` Claude Code 会自动完成克隆、安装依赖、创建配置文件、运行 Pipeline 的全部步骤。 ### 方式 B:手动安装 ```bash # 1. 
克隆项目 git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # ⚠️ 如果已经克隆过,务必先更新! # git pull origin main # 2. 创建 Python 虚拟环境 python3 -m venv .venv source .venv/bin/activate # macOS / Linux # .venv\Scripts\activate # Windows(推荐使用 WSL2) # 3. 安装项目 pip install -e . # 4. 验证安装成功 researchclaw --help ``` ### ⚙️ 配置文件 ```bash cp config.researchclaw.example.yaml config.yaml ``` 编辑 `config.yaml`,修改以下关键字段: ```yaml # === 项目设置 === project: name: "my-test" mode: "full-auto" # === 研究主题——用英文描述你的 idea === research: topic: "你的研究 idea,用英文描述,一两句话即可" domains: - "machine-learning" # 可选: nlp, cv, rl, graph-learning, etc. # === LLM 配置——请使用最强模型! === # # 方案一:OpenAI(推荐 GPT-5.4) llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-5.4" # 首选最强模型 fallback_models: - "gpt-5.1" - "gpt-4.1" # 方案二:Anthropic Claude(推荐 Claude Opus 4.6) # llm: # provider: "openai-compatible" # base_url: "https://api.anthropic.com/v1" # api_key_env: "ANTHROPIC_API_KEY" # primary_model: "claude-opus-4-6" # fallback_models: # - "claude-sonnet-4-6" # === 实验模式 === experiment: mode: "sandbox" # sandbox = 本地执行(推荐) time_budget_sec: 600 # 每次实验最长运行时间(秒) max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" # 或 "maximize" ``` ### 🔐 设置 API Key ```bash # OpenAI 用户: export OPENAI_API_KEY="sk-xxxxxxxxxxxxxxxxxxxxxxxx" # Anthropic 用户: export ANTHROPIC_API_KEY="sk-ant-xxxxxxxxxxxxxxxxxxxxxxxx" # 可选:Semantic Scholar API Key(可加快文献检索) export S2_API_KEY="your-s2-key" ``` > **🔒 安全提醒:** 请勿将 API Key 硬编码在任何文件中。使用 `api_key_env` 指定环境变量名即可。 --- ## 🚀 运行测试 ### 快速开始 ```bash source .venv/bin/activate export OPENAI_API_KEY="sk-xxxx" # 或 ANTHROPIC_API_KEY researchclaw run --config config.yaml --auto-approve ``` ### 指定研究主题运行 ```bash researchclaw run \ --config config.yaml \ --topic "Investigating the effect of curriculum learning on image classification with adaptive difficulty scheduling" \ --auto-approve ``` ### ⏱ 预估运行时间 | 实验模式 | 预估时间 | 
说明 | |---------|---------|------| | sandbox | 30 分钟 ~ 2 小时 | 取决于实验复杂度和 API 响应速度 | | docker (GPU) | 1 ~ 4 小时 | 可运行更复杂的深度学习实验 | 运行过程中终端会实时显示当前阶段和进度。**无需任何手动操作**,安心等待即可。 ### ✅ 如何知道运行结束 当看到类似以下输出时,表示 Pipeline 已成功完成: ``` [Stage 23/23] ✓ Deliverables packaged Pipeline complete — deliverables at: artifacts/rc-20260315-XXXXXX-YYYY/deliverables/ ``` ### 🔄 如果运行中断 Pipeline 支持断点续跑: ```bash researchclaw run --config config.yaml --resume ``` --- ## 🔍 查看交付结果 运行结束后,输出文件位于 `artifacts/rc-YYYYMMDD-HHMMSS-⭐ 如果你觉得这个项目有趣,请在 GitHub 上给我们一颗 Star!
================================================ FILE: docs/TESTER_GUIDE_JA.md ================================================
世界初の完全自律型研究パイプラインを、あらゆる分野でストレステストするためにご協力ください。
⭐ リポジトリにスターを付ける · 🚀 クイックスタート · 📋 フィードバックテンプレート · 🇺🇸 English Testing Guide · 🇨🇳 中文测试指南
--- ## 👋 テスターの皆さんへ **AutoResearchClaw** は、完全自律型の学術論文生成パイプラインです。研究アイデアを入力するだけで、文献検索、実験設計、コード生成、実験実行、論文執筆、査読、最終成果物の作成まで、すべてを自動で処理します。**23ステージ、人手介入ゼロ。** **あらゆる分野・バックグラウンド**のテスターを募集しています — 機械学習、NLP、コンピュータビジョン、強化学習、バイオインフォマティクス、物理学、社会科学など。テストが多様であるほど、パイプラインの改善に繋がります。 **あなたのミッション:** 自分の研究アイデアでパイプラインを実行し、出力を検査して、詳細なフィードバックレポートを提出してください。それだけです。すべてのフィードバックが次のバージョンに直接反映されます。 --- ## 📋 目次 1. [前提条件](#-前提条件) 2. [インストールとセットアップ](#-インストールとセットアップ) 3. [パイプラインの実行](#-パイプラインの実行) 4. [出力の確認](#-出力の確認) 5. [フィードバックレポートの要件](#-フィードバックレポートの要件) 6. [フィードバックテンプレート](#-フィードバックテンプレート) 7. [FAQ](#-faq) --- ## 📦 前提条件 | 項目 | 最小要件 | 推奨 | |------|---------|------| | OS | macOS / Linux / WSL2 | Linux (Ubuntu 22.04+) | | Python | 3.11+ | 3.11 または 3.12 | | ディスク | 500 MB | 2 GB+ | | RAM | 8 GB | 16 GB+ | | GPU | 不要(sandboxモード) | NVIDIA GPU + CUDA 12.x(dockerモード) | | ネットワーク | 必要(LLM API + 文献検索) | 安定した接続 | | LLM APIキー | **必須** | OpenAI または Anthropic | ### 🔑 APIキーについて パイプラインは、執筆、コーディング、レビューなど、すべてのステージで大規模言語モデル(LLM)を呼び出します。**OpenAI** または **Anthropic** のAPIキーが必要です。 > **最良の結果を得るために、利用可能な最も高性能なモデルの使用を強く推奨します:** > > | プロバイダー | 推奨モデル | フォールバック | > |-------------|-----------|--------------| > | **OpenAI** | **GPT-5.4**(最良) | GPT-5.1 または GPT-4.1 | > | **Anthropic** | **Claude Opus 4.6**(最良) | Claude Sonnet 4.6 | > > トップティアのモデルを使用することで、論文の品質、コードの正確性、実験設計が大幅に向上します。古いモデル(例:GPT-4o)では、出力品質が著しく低下する可能性があります。 --- ## 🛠 インストールとセットアップ ### ⚠️ 常に最新バージョンを使用してください > **このプロジェクトは活発に開発中です。** コードベースは頻繁に更新され、バージョンによって結果が大きく異なる場合があります。 > > **テスト実行の前に、必ず最新のコードをプルしてください:** > > ```bash > cd AutoResearchClaw > git pull origin main > pip install -e . # 変更を反映するために再インストール > ``` > > フィードバックレポート用にバージョンを記録してください: > ```bash > git log --oneline -1 > ``` --- ### オプションA:Claude Code(最速 — 推奨 ⚡) [Claude Code](https://claude.ai/claude-code)(AnthropicのCLIツール)をお持ちの場合、以下を貼り付けるだけです: ``` Please clone and install AutoResearchClaw: https://github.com/aiming-lab/AutoResearchClaw.git If already cloned, run git pull origin main to update to the latest version first. 
Then create a config file with: - LLM: OpenAI with gpt-5.4 (or Anthropic Claude Opus 4.6) - Experiment mode: sandbox (local execution) - Research topic: "<ここに研究アイデアを入力>" - Auto-approve all gate stages My API key is: sk-xxxx (set it as an environment variable, don't hardcode it) ``` Claude Codeがクローン、依存関係、設定、実行をすべて自動で処理します。 ### オプションB:手動インストール ```bash # 1. リポジトリをクローン git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # 2. 仮想環境を作成 python3 -m venv .venv source .venv/bin/activate # macOS / Linux # .venv\Scripts\activate # Windows(WSL2推奨) # 3. インストール pip install -e . # 4. 動作確認 researchclaw --help ``` ### ⚙️ 設定 ```bash cp config.researchclaw.example.yaml config.yaml ``` `config.yaml` を編集してください — 主要なフィールドは以下の通りです: ```yaml # === プロジェクト === project: name: "my-test" mode: "full-auto" # === 研究トピック — アイデアを英語で記述してください === research: topic: "Your research idea in 1-2 sentences" domains: - "machine-learning" # 選択肢: nlp, cv, rl, graph-learning など # === LLM — 利用可能な最も高性能なモデルを使用してください! 
=== # # オプション1: OpenAI(GPT-5.4推奨) llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-5.4" # 最良のモデル fallback_models: - "gpt-5.1" - "gpt-4.1" # オプション2: Anthropic Claude(Claude Opus 4.6推奨) # llm: # provider: "openai-compatible" # base_url: "https://api.anthropic.com/v1" # api_key_env: "ANTHROPIC_API_KEY" # primary_model: "claude-opus-4-6" # fallback_models: # - "claude-sonnet-4-6" # === 実験 === experiment: mode: "sandbox" # sandbox = ローカル実行(推奨) time_budget_sec: 600 # 実験実行あたりの最大秒数 max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" # または "maximize" ``` ### 🔐 APIキーの設定 ```bash # OpenAIユーザー: export OPENAI_API_KEY="sk-xxxxxxxxxxxxxxxxxxxxxxxx" # Anthropicユーザー: export ANTHROPIC_API_KEY="sk-ant-xxxxxxxxxxxxxxxxxxxxxxxx" # オプション:Semantic Scholar APIキー(文献検索を高速化) export S2_API_KEY="your-s2-key" ``` > **🔒 セキュリティ:** APIキーをファイルにハードコードしないでください。設定ファイルの `api_key_env` を使用して環境変数を参照してください。 --- ## 🚀 パイプラインの実行 ### クイックスタート ```bash source .venv/bin/activate export OPENAI_API_KEY="sk-xxxx" # または ANTHROPIC_API_KEY researchclaw run --config config.yaml --auto-approve ``` ### 特定のトピックを指定する場合 ```bash researchclaw run \ --config config.yaml \ --topic "Investigating the effect of curriculum learning on image classification with adaptive difficulty scheduling" \ --auto-approve ``` ### ⏱ 想定実行時間 | モード | 推定時間 | 備考 | |--------|---------|------| | sandbox | 30分 〜 2時間 | 実験の複雑さとAPIの速度に依存 | | docker (GPU) | 1 〜 4時間 | より大規模なディープラーニング実験向け | ターミナルにリアルタイムで進捗が表示されます。**手動介入は不要です** — あとは実行完了を待つだけです。 ### ✅ 完了の確認方法 以下のような出力が表示されます: ``` [Stage 23/23] ✓ Deliverables packaged Pipeline complete — deliverables at: artifacts/rc-20260315-XXXXXX-YYYY/deliverables/ ``` ### 🔄 中断された場合 パイプラインはチェックポイントをサポートしています — 再開するだけです: ```bash researchclaw run --config config.yaml --resume ``` --- ## 🔍 出力の確認 完了後、結果は `artifacts/rc-YYYYMMDD-HHMMSS-⭐ このプロジェクトに興味を持たれたら、GitHubでスターをお願いします!
================================================ FILE: docs/agent_figure_and_benchmark_plan.md ================================================ # Multi-Agent Figure Generation & Benchmark Selection — Task Requirements > **Created**: 2026-03-15 > **Updated**: 2026-03-15 > **Status**: BenchmarkAgent IMPLEMENTED, FigureAgent IMPLEMENTED > **Scope**: Two new multi-agent subsystems for AutoResearchClaw pipeline > > **Implementation Progress**: > - [x] Part B: BenchmarkAgent — fully implemented (4 agents + orchestrator + config + pipeline integration + 43 tests) > - [x] Part A: FigureAgent — fully implemented (5 agents + orchestrator + config + pipeline integration + 45 tests) > > **Key Research Findings (supplemental)**: > - Papers With Code was shut down by Meta in July 2025; HuggingFace Hub API is now the primary dataset discovery source > - AI Scientist v2 and MLR-Copilot both use pure LLM-driven dataset selection (no API search) — our API-based approach is more structured > - MLE-bench (OpenAI) validates the pre-download + container-mount pattern (matches our `setup_only` network policy) > - CodeSOTA (codesota.com) provides a lighter-weight benchmark database as an alternative to Papers With Code --- ## Executive Summary 当前 Pipeline 的图表生成和数据集/基准选择存在根本性缺陷: **图表问题**(实测产出): - 每次固定只生成 2 张图(`method_comparison.png` + `experiment_comparison.png`) - 图表类型单一:只有柱状图,无训练曲线、热力图、消融分析图等 - 数据无差异化:所有方法都显示 1.000,完全无信息量 - 样式简陋:默认 matplotlib 风格,远低于 AI 顶会标准 - 不适应实验内容:无论做什么研究都画一样的图 - DPI=150,不满足出版要求(300+ DPI) **数据集/基准问题**: - 当前仅通过 `dataset_guidance` 提示词列出预缓存数据集 - 无法根据研究领域动态搜索和选择最合适的 benchmark - 无法自动下载非预缓存数据集 - 缺乏 baseline 方法的自动复现能力 **解决方案**:设计两个独立的多 Agent 子系统: 1. **FigureAgent** — 智能图表生成系统(6 个子 Agent 协作) 2. 
**BenchmarkAgent** — 数据集与基准选择系统(4 个子 Agent 协作) --- ## Part A: FigureAgent — 多 Agent 图表生成系统 ### A.1 问题分析 #### 当前架构缺陷 ``` 现状:Stage 14 → visualize.py (5 个硬编码函数) → 固定 2 张图 → Stage 17/22 嵌入论文 ``` | 问题 | 严重程度 | 说明 | |------|---------|------| | 图表类型固定 | Critical | 只有 bar chart 和 line chart,缺少 heatmap、scatter、violin、architecture diagram 等 | | 不适应实验内容 | Critical | 知识蒸馏实验和 RL 实验画的图完全一样 | | 无智能决策 | Critical | 不分析"应该画什么",直接调用固定函数 | | 数据正确性无验证 | High | 不验证图中数据是否与实验结果一致 | | 样式不达标 | High | 默认 matplotlib,不符合学术论文视觉标准 | | 无架构图能力 | High | 不能生成方法流程图 / 模型架构图(顶会 Figure 1 必备) | | DPI 不足 | Medium | 150 DPI,出版要求 300+ | | 无 VLM 审查 | Medium | 生成后不检查质量,直接用 | #### 业界参考方案 | 项目 | 图表策略 | 核心创新 | |------|---------|---------| | AI Scientist v1 (Sakana) | 人工编写 `plot.py` 模板,LLM 不参与 | 可靠但不灵活 | | AI Scientist v2 (Sakana) | LLM 自主生成画图代码 + VLM 审查反馈循环 | **VLM-as-critic**,首篇通过 ICLR workshop 审稿 | | PlotGen (Adobe) | 三模态反馈:数值准确性 + 文本正确性 + 视觉质量 | **Tri-modal feedback**,MatPlotBench 最优 | | PaperBanana (Google) | 3 阶段 pipeline:Caption 精炼 → 参考检索 → 迭代渲染 | **Caption sharpening** + 参考图库 | ### A.2 目标架构 ``` ┌─────────────────────┐ │ FigureAgent │ │ (Orchestrator) │ └──────────┬──────────┘ │ ┌──────────┬───────────┼───────────┬──────────┐ ▼ ▼ ▼ ▼ ▼ ┌──────────┐┌──────────┐┌──────────┐┌──────────┐┌──────────┐ │ Planner ││ CodeGen ││ Renderer ││ Critic ││ Integra- │ │ Agent ││ Agent ││ Agent ││ Agent ││ tor Agent│ └──────────┘└──────────┘└──────────┘└──────────┘└──────────┘ │ │ │ │ │ ▼ ▼ ▼ ▼ ▼ 图表规划 代码生成 执行渲染 质量审查 论文嵌入 ``` #### Agent 职责定义 **1. Orchestrator(编排器)** - 接收:实验结果 JSON、论文草稿 markdown、研究主题描述 - 协调所有子 Agent 的执行顺序 - 管理迭代循环(Critic 不满意时回到 CodeGen) - 输出:最终图表集合 + 嵌入指令 **2. 
Planner Agent(图表规划)** - 输入:实验结果数据结构、论文 idea、研究领域 - 职责: - 分析实验数据,确定需要哪些图、每张图展示什么 - 为每张图生成精确的 caption specification(非模糊描述) - 确定图表类型(bar / line / heatmap / scatter / architecture / ablation 等) - 确定布局(single / subplot / multi-panel) - 输出图表规划清单(JSON 格式) - 关键规则: - 至少规划 4 张图:1 架构图 + 1 主结果图 + 1 消融图 + 1 分析图 - 根据研究领域自动选择合适的图表类型 - Caption sharpening:将模糊描述转化为精确视觉规范 **3. CodeGen Agent(代码生成)** - 输入:Planner 输出的图表规划 + 实验数据 - 职责: - 为每张图生成独立的 Python 绘图脚本 - 使用 SciencePlots 学术样式 (`plt.style.use(['science', 'ieee'])`) - 确保 colorblind-safe 配色 - 300+ DPI 输出 - 代码保存到 `charts/scripts/` 供复现 - 代码模板库: - 内置常用学术图表模板(training curve, bar comparison, heatmap, confusion matrix 等) - 新图表可基于模板扩展 **4. Renderer Agent(渲染执行)** - 输入:CodeGen 生成的 Python 脚本 - 职责: - 在 Docker sandbox 中执行绘图脚本 - 捕获执行错误并反馈给 CodeGen 修复 - 验证输出文件存在且可读 - 检查图像尺寸和分辨率 **5. Critic Agent(质量审查 — 三模态反馈)** - 输入:渲染后的图像 + 源数据 + caption 规范 - 职责(三维度审查,参考 PlotGen): - **数值准确性**:验证图中呈现的数值与源数据一致(读取 JSON → 对比图中数据点) - **文本正确性**:检查标题、坐标轴标签、图例是否准确完整 - **视觉质量**:通过 VLM(如 GPT-4o vision)审查整体美观度、可读性、学术规范 - 输出:pass / fail + 具体修改建议 - 如果 fail:将反馈发回 CodeGen Agent,最多迭代 3 次 **6. 
Integrator Agent(论文嵌入)** - 输入:通过审查的图表集合 + 论文草稿 - 职责: - 确定每张图在论文中的最佳位置 - 生成 LaTeX figure 环境代码(支持 subfigure 多面板) - 生成交叉引用(`\ref{fig:xxx}`) - 确保图表在正确的 section(架构图在 Method,结果图在 Results) - 更新论文文本中的图表引用语句 ### A.3 图表类型矩阵 根据研究领域和实验类型,Planner Agent 应遵循以下决策矩阵: | 实验类型 | 必须包含的图表 | 可选图表 | |---------|--------------|---------| | **分类任务** | 精度对比 bar chart、confusion matrix | ROC 曲线、t-SNE 可视化 | | **生成模型** | 生成样本 grid、FID/IS 曲线 | 插值可视化、attention map | | **强化学习** | reward curve (mean±std shading)、episode length | 策略可视化、环境截图 | | **知识蒸馏** | teacher-student 精度对比、知识迁移效率曲线 | 特征对齐热力图 | | **NLP** | BLEU/ROUGE 对比表、attention heatmap | 样本输出对比 | | **图神经网络** | 节点分类精度、图可视化 | 消息传递可视化 | | **元学习** | few-shot 精度 vs shot 数曲线 | 任务适应速度 | | **持续学习** | 遗忘率曲线、任务精度矩阵 | 表征漂移可视化 | | **所有类型** | 消融分析 (grouped bar)、训练 loss 曲线 | 超参敏感性热力图 | ### A.4 样式规范 所有图表必须遵循以下学术出版标准: ```python # 全局样式配置 (charts/style_config.py) STYLE_CONFIG = { "matplotlib_style": ["science", "ieee"], # SciencePlots "dpi": 300, # 出版级 "font_size": {"title": 12, "axis": 10, "tick": 8, "legend": 9}, "figure_width": { "single_column": 3.5, # IEEE single column (inches) "double_column": 7.0, # IEEE double column "full_page": 7.0, # Full width }, "colors": "bright", # colorblind-safe (Paul Tol) "line_styles": ["-", "--", "-.", ":"], # 配合 B&W 打印 "marker_styles": ["o", "s", "^", "D", "v", "P"], "error_bar_style": "shading", # mean ± std 用阴影而非 error bar "format": "pdf", # 矢量格式优先 "fallback_format": "png", # PNG 备用 } ``` ### A.5 实现计划 #### 文件结构 ``` researchclaw/ ├── agents/ │ └── figure_agent/ │ ├── __init__.py │ ├── orchestrator.py # FigureAgent 主编排器 │ ├── planner.py # Planner Agent │ ├── codegen.py # CodeGen Agent │ ├── renderer.py # Renderer Agent │ ├── critic.py # Critic Agent (三模态审查) │ ├── integrator.py # Integrator Agent │ ├── templates/ # 图表代码模板库 │ │ ├── bar_comparison.py │ │ ├── training_curve.py │ │ ├── heatmap.py │ │ ├── confusion_matrix.py │ │ ├── scatter_plot.py │ │ ├── ablation_grouped.py │ │ ├── violin_box.py │ │ └── multi_panel.py │ └── 
style_config.py # 全局样式配置 ``` #### 开发任务清单 | ID | 任务 | 依赖 | 估计改动量 | |----|------|------|-----------| | FA-01 | 创建 `agents/figure_agent/` 目录结构和基础类 | 无 | 新建 | | FA-02 | 实现 Planner Agent:图表规划逻辑 + 类型决策矩阵 | FA-01 | ~300 行 | | FA-03 | 实现 CodeGen Agent:代码生成 + 模板库 | FA-01 | ~500 行 | | FA-04 | 实现 Renderer Agent:sandbox 执行 + 错误处理 | FA-01, FA-03 | ~200 行 | | FA-05 | 实现 Critic Agent:三模态审查(数值 / 文本 / VLM) | FA-01, FA-04 | ~400 行 | | FA-06 | 实现 Integrator Agent:论文嵌入 + LaTeX subfigure 支持 | FA-01 | ~250 行 | | FA-07 | 实现 Orchestrator:编排循环 + 最大迭代控制 | FA-02 ~ FA-06 | ~300 行 | | FA-08 | 添加 SciencePlots 到 Docker 镜像 + 样式配置 | 无 | ~50 行 | | FA-09 | 修改 executor.py:Stage 14 调用 FigureAgent 替代 `visualize.py` | FA-07 | ~100 行 | | FA-10 | 修改 executor.py:Stage 17/22 使用 Integrator 输出 | FA-07 | ~100 行 | | FA-11 | 修改 converter.py:支持 subfigure 和 PDF 格式 | FA-06 | ~80 行 | | FA-12 | 添加图表代码模板库(8+ 模板) | FA-03 | ~600 行 | | FA-13 | 测试:单元测试 + 集成测试 | FA-01 ~ FA-12 | ~400 行 | | FA-14 | 向后兼容:保留 `visualize.py` 作为 fallback | FA-09 | ~30 行 | #### Pipeline 集成点 ``` Stage 12-13: 实验执行完成,生成 results.json │ ▼ Stage 14: Result Analysis │── 调用 FigureAgent.orchestrate() │ ├── Planner: 分析 results.json → 图表规划 │ ├── CodeGen: 生成绘图脚本 → charts/scripts/ │ ├── Renderer: 执行脚本 → charts/*.pdf + charts/*.png │ ├── Critic: 审查图表质量 (max 3 iterations) │ └── 输出: charts/ 目录 + figure_manifest.json │ ▼ Stage 17: Paper Draft │── Integrator: 读取 figure_manifest.json │ ├── 确定每张图的论文位置 │ ├── 注入 markdown 图片引用 + caption │ └── 更新交叉引用文本 │ ▼ Stage 22: Paper Export │── 复制 charts/ 到 submission/ │── converter.py 处理 subfigure 环境 └── 最终 LaTeX 编译验证 ``` --- ## Part B: BenchmarkAgent — 多 Agent 数据集与基准选择系统 ### B.1 问题分析 #### 当前架构缺陷 ``` 现状:dataset_guidance 提示词 (硬编码列表) + dataset_registry.yaml (静态清单) → LLM 自行选择 ``` | 问题 | 严重程度 | 说明 | |------|---------|------| | 数据集选择不智能 | Critical | 仅列出预缓存数据集,LLM 可能选择不合适的 benchmark | | 无领域适配 | Critical | 不根据研究领域搜索该领域的标准 benchmark | | 无最新性保证 | High | 不检查是否有更新、更好的 benchmark 可用 | | baseline 无法复现 | High | 不提供已有方法的参考实现 / 预训练权重 | | 下载路径硬编码 | 
Medium | 非预缓存数据集无法自动获取 | | 无数据集验证 | Medium | 不验证下载的数据集是否完整、格式正确 | #### 理想工作流 一个好的数据集/基准选择流程应该: 1. **理解研究问题** → 确定评估维度(分类精度?生成质量?推理速度?) 2. **搜索领域标准** → 查找该领域顶会论文常用的 benchmark 3. **评估适用性** → 数据集大小、难度、License、可获取性 4. **获取数据** → 自动下载或生成下载脚本 5. **获取 baseline** → 找到对比方法的开源实现或预训练权重 6. **验证完整性** → 确认数据集可正常加载和使用 ### B.2 目标架构 ``` ┌─────────────────────┐ │ BenchmarkAgent │ │ (Orchestrator) │ └──────────┬──────────┘ │ ┌──────────┬───────────┼───────────┐ ▼ ▼ ▼ ▼ ┌──────────┐┌──────────┐┌──────────┐┌──────────┐ │ Surveyor ││ Selector ││ Acquirer ││ Validator│ │ Agent ││ Agent ││ Agent ││ Agent │ └──────────┘└──────────┘└──────────┘└──────────┘ │ │ │ │ ▼ ▼ ▼ ▼ 领域调研 选择决策 数据获取 验证确认 ``` #### Agent 职责定义 **1. Orchestrator(编排器)** - 接收:研究主题、假设、实验设计方案 - 协调 4 个子 Agent 的执行 - 输出:`benchmark_plan.json`(包含数据集列表、下载脚本、baseline 方案) **2. Surveyor Agent(领域调研)** - 输入:研究主题关键词、相关文献列表 - 职责: - 搜索 Papers With Code 的领域 benchmark 排行榜 - 搜索 HuggingFace Datasets 的相关数据集 - 搜索 OpenML、Kaggle 的相关 benchmark - 分析近 2 年顶会论文(ICML、NeurIPS、ICLR)使用的数据集 - 汇总领域标准 benchmark 清单(含引用频次、数据规模、难度级别) - 输出:`survey_results.json` — 候选 benchmark 列表(按推荐度排序) - 数据源优先级: 1. Papers With Code (Benchmarks API) 2. HuggingFace Datasets Hub 3. torchvision / torchaudio / torchtext 内置 4. 顶会论文附录中的数据集描述 **3. Selector Agent(选择决策)** - 输入:survey_results.json + 实验约束(GPU 内存、时间预算、网络可用性) - 职责: - 根据约束过滤不可行的数据集(太大 / 需要申请 / License 不兼容) - 考虑 Docker sandbox 已缓存的数据集(优先使用) - 选择 primary benchmark(必须是领域标准)+ secondary benchmarks(补充验证) - 选择 baseline 方法(至少 2 个有开源实现的对比方法) - 生成选择理由文档(供论文 Experimental Setup section 使用) - 约束规则: - Tier 1(已缓存):无网络需求,最优先 - Tier 2(torchvision/HF datasets 可直接下载):需 setup 阶段网络 - Tier 3(需自定义下载脚本):仅在 `network_policy: full` 时可用 - 输出:`selected_benchmarks.json` + `baseline_methods.json` **4. 
Acquirer Agent(数据获取)** - 输入:selected_benchmarks.json - 职责: - 生成 `setup.py` 中的数据集下载代码 - 为每个数据集生成加载 boilerplate 代码 - 为 baseline 方法生成安装和调用代码 - 处理 HuggingFace `datasets.load_dataset()` / `torchvision.datasets` 等接口 - 生成 `requirements.txt` 中需要额外安装的包 - 输出: - `data_loading_snippets.py` — 数据加载代码片段(注入 CodeAgent) - `baseline_snippets.py` — baseline 调用代码片段 - `setup.py` 追加内容 — 下载脚本 **5. Validator Agent(验证确认)** - 输入:Acquirer 生成的下载/加载代码 - 职责: - 验证数据集 API 调用语法正确 - 验证数据集分割(train/val/test)存在 - 验证数据格式与实验代码兼容 - 验证 baseline 方法可运行 - 如果验证失败,反馈给 Acquirer 修复 - 输出:validation_report.json ### B.3 知识库设计 BenchmarkAgent 需要一个结构化知识库来支持决策: ```yaml # researchclaw/data/benchmark_knowledge.yaml domains: image_classification: standard_benchmarks: - name: CIFAR-10/100 source: torchvision tier: 1 # 已缓存 difficulty: easy/medium use_when: "小规模验证、快速原型" - name: ImageNet-1K source: torchvision tier: 3 # 需要下载 ~150GB difficulty: hard use_when: "大规模验证、与 SOTA 对比" common_baselines: - name: ResNet-50 source: "torchvision.models.resnet50(pretrained=True)" paper: "He et al., 2016" - name: ViT-B/16 source: "timm.create_model('vit_base_patch16_224', pretrained=True)" paper: "Dosovitskiy et al., 2021" reinforcement_learning: standard_benchmarks: - name: Gymnasium (MuJoCo) source: "gymnasium[mujoco]" tier: 2 - name: Atari source: "gymnasium[atari]" tier: 2 common_baselines: - name: PPO source: "stable-baselines3" paper: "Schulman et al., 2017" # ... 
更多领域 ``` ### B.4 实现计划 #### 文件结构 ``` researchclaw/ ├── agents/ │ └── benchmark_agent/ │ ├── __init__.py │ ├── orchestrator.py # BenchmarkAgent 主编排器 │ ├── surveyor.py # Surveyor Agent (领域调研) │ ├── selector.py # Selector Agent (选择决策) │ ├── acquirer.py # Acquirer Agent (数据获取) │ ├── validator.py # Validator Agent (验证确认) │ └── knowledge_base.py # 知识库加载和查询 ├── data/ │ ├── benchmark_knowledge.yaml # 领域 benchmark 知识库 │ └── dataset_registry.yaml # 已有数据集注册表 (保留) ``` #### 开发任务清单 | ID | 任务 | 依赖 | 估计改动量 | |----|------|------|-----------| | BA-01 | 创建 `agents/benchmark_agent/` 目录结构和基础类 | 无 | 新建 | | BA-02 | 编写 `benchmark_knowledge.yaml` 知识库(覆盖 10+ 领域) | 无 | ~500 行 YAML | | BA-03 | 实现 Surveyor Agent:Papers With Code API + HF Datasets 搜索 | BA-01 | ~350 行 | | BA-04 | 实现 Selector Agent:约束过滤 + Tier 匹配 + 选择逻辑 | BA-01, BA-02 | ~300 行 | | BA-05 | 实现 Acquirer Agent:代码生成 + 下载脚本 | BA-01, BA-04 | ~350 行 | | BA-06 | 实现 Validator Agent:语法/可用性验证 | BA-01, BA-05 | ~250 行 | | BA-07 | 实现 Orchestrator:编排 + 迭代修复 | BA-02 ~ BA-06 | ~250 行 | | BA-08 | 修改 executor.py:Stage 6/7 调用 BenchmarkAgent | BA-07 | ~150 行 | | BA-09 | 修改 executor.py:将 benchmark_plan 注入 CodeAgent | BA-07 | ~100 行 | | BA-10 | 更新 prompts.py:基于 BenchmarkAgent 输出动态构建提示词 | BA-07 | ~100 行 | | BA-11 | 测试:单元测试 + 集成测试 | BA-01 ~ BA-10 | ~300 行 | | BA-12 | 向后兼容:保留 `dataset_registry.yaml` 作为 fallback | BA-08 | ~30 行 | #### Pipeline 集成点 ``` Stage 3: Topic Initialization │── 研究主题确定 ▼ Stage 4-5: Literature Collection & Screening │── 文献列表生成 ▼ Stage 6: Hypothesis Generation │── 调用 BenchmarkAgent.orchestrate() │ ├── Surveyor: 搜索领域标准 benchmark │ ├── Selector: 根据约束选择最优 benchmark + baseline │ ├── Acquirer: 生成下载/加载代码 │ └── Validator: 验证代码可执行 │── 输出: benchmark_plan.json ▼ Stage 7: Experiment Design │── benchmark_plan.json 注入实验设计 │── 实验方案明确使用哪些数据集和 baseline ▼ Stage 8-9: Code Generation (CodeAgent) │── data_loading_snippets 注入生成代码 │── baseline_snippets 注入对比方法 ▼ Stage 10-11: Experiment Execution │── setup.py 执行数据集下载 │── main.py 使用生成的数据加载代码 ▼ Stage 14: Result 
Analysis │── 对比结果基于真实 baseline,可信度高 ``` --- ## Part C: 共同基础设施 ### C.1 Agent 基类 两个多 Agent 系统共享同一套基础设施: ```python # researchclaw/agents/base.py class BaseAgent: """所有子 Agent 的基类""" def __init__(self, llm_client, config): self.llm = llm_client self.config = config self.logger = logging.getLogger(self.__class__.__name__) async def execute(self, context: dict) -> dict: """执行 Agent 任务,返回结果""" raise NotImplementedError def _call_llm(self, system_prompt, user_prompt, **kwargs): """统一 LLM 调用接口""" return self.llm.chat(system_prompt, user_prompt, **kwargs) class AgentOrchestrator: """Agent 编排器基类""" def __init__(self, agents: list[BaseAgent], max_iterations=3): self.agents = agents self.max_iterations = max_iterations async def orchestrate(self, context: dict) -> dict: """执行多 Agent 编排流程""" raise NotImplementedError ``` ### C.2 与现有 LLM Client 的集成 两个系统都通过现有的 `researchclaw/llm/client.py` 调用 LLM: - Planner / Selector / Critic 等决策类 Agent → 使用 `gpt-4.1` 或 `gpt-4o` - CodeGen 类 Agent → 使用 `gpt-4.1`(代码生成能力最强) - VLM Critic → 使用 `gpt-4o`(支持 vision) ### C.3 配置扩展 ```yaml # config.yaml 新增配置 agents: figure_agent: enabled: true max_iterations: 3 # Critic 反馈最大迭代次数 min_figures: 4 # 最少图表数 style: "science+ieee" # matplotlib 样式 dpi: 300 format: "pdf" # 优先格式 vlm_review: true # 是否启用 VLM 视觉审查 benchmark_agent: enabled: true max_search_results: 20 # Papers With Code 最大搜索结果 prefer_cached: true # 优先使用已缓存数据集 tier_limit: 2 # 最高允许的 Tier 级别 (1=缓存, 2=可下载, 3=大型) min_baselines: 2 # 最少 baseline 方法数 ``` --- ## Part D: 风险与兜底 ### D.1 向后兼容 | 组件 | 兜底策略 | |------|---------| | FigureAgent 失败 | 回退到现有 `visualize.py` 生成基础图表 | | BenchmarkAgent 失败 | 回退到 `dataset_registry.yaml` + `dataset_guidance` 提示词 | | VLM 审查不可用 | 跳过视觉审查,仅做数值 + 文本验证 | | SciencePlots 未安装 | 使用 `seaborn-v0_8-whitegrid` 样式 | | 网络不可用 | Surveyor 使用本地 `benchmark_knowledge.yaml` | ### D.2 Token 成本控制 | 操作 | 预估 Token 消耗 | 控制策略 | |------|----------------|---------| | Planner (1 次) | ~2K input + ~1K output | 固定 | | CodeGen (4 图 × 最多 3 次迭代) | ~3K × 12 = ~36K | 迭代次数上限 
| | Critic (4 图 × 最多 3 次) | ~2K × 12 = ~24K | 迭代次数上限 | | VLM 审查 (4 图) | ~4K × 4 = ~16K | 仅终轮审查 | | Surveyor (1 次) | ~2K input + ~2K output | API 调用为主 | | Selector (1 次) | ~3K input + ~1K output | 固定 | | **总增量** | **~80K tokens** | 约增加 $0.30-0.50/run | ### D.3 测试策略 1. **单元测试**:每个 Agent 独立测试(mock LLM 响应) 2. **集成测试**:使用固定 results.json 测试 FigureAgent 完整流程 3. **回归测试**:确认 fallback 到旧系统仍可正常工作 4. **端到端测试**:Run 14+ 完整 Pipeline 运行,对比图表质量 --- ## Part E: 执行优先级 建议按以下顺序实施: ### Phase 1: FigureAgent 核心(优先级最高) 1. FA-01 ~ FA-03: 基础类 + Planner + CodeGen 2. FA-04 ~ FA-05: Renderer + Critic 3. FA-08: SciencePlots 集成 4. FA-12: 模板库 ### Phase 2: FigureAgent 集成 5. FA-06 ~ FA-07: Integrator + Orchestrator 6. FA-09 ~ FA-11: Pipeline 集成 7. FA-13 ~ FA-14: 测试 + 向后兼容 ### Phase 3: BenchmarkAgent 核心 8. BA-01 ~ BA-02: 基础类 + 知识库 9. BA-03 ~ BA-06: 4 个子 Agent 10. BA-07: Orchestrator ### Phase 4: BenchmarkAgent 集成 11. BA-08 ~ BA-10: Pipeline 集成 12. BA-11 ~ BA-12: 测试 + 向后兼容 ### Phase 5: 端到端验证 13. 完整 Pipeline 运行(Run 14+) 14. 对比图表质量和数据集选择质量 15. 
根据结果调优 --- ## Appendix: 参考资料 | 来源 | 关键收获 | |------|---------| | [AI Scientist v2](https://github.com/SakanaAI/AI-Scientist-v2) | VLM-as-critic, 首篇通过 ICLR workshop 审稿 | | [PlotGen (Adobe)](https://arxiv.org/abs/2502.00988) | 三模态反馈:数值 + 文本 + 视觉 | | [PaperBanana (Google)](https://github.com/llmsresearch/paperbanana) | Caption sharpening + 参考图库检索 | | [SciencePlots](https://github.com/garrettj403/SciencePlots) | 学术论文 matplotlib 样式库 | | [VLM-Enhanced Discovery](https://arxiv.org/html/2511.14631) | Correction mode + Discovery mode | | [Papers With Code API](https://paperswithcode.com/api/v1/) | 领域 benchmark 排行榜搜索 | | [HuggingFace Datasets](https://huggingface.co/docs/datasets/) | 数据集搜索和加载 API | ================================================ FILE: docs/figure_prompts/case_a_meta_learning.md ================================================ # Case A: Continual Meta-Learning — Image Generation Prompt ## Prompt A premium, modern data visualization infographic on a clean white background with subtle light-gray grid lines. The chart is a **line chart** showing progressive performance improvement across 5 data points on the X-axis (labeled "Self-Iteration Round"). **Overall title** at the top in bold dark navy sans-serif font: "Case A: Continual Meta-Learning for Few-Shot Adaptation" **Y-axis:** "Few-Shot Accuracy (%)" ranging from 15% to 105%. **X-axis:** "Self-Iteration Round" with 5 labeled tick marks. **Data points and line:** - Point 0 (Baseline): 25.9% — large circle marker, colored **slate gray** (#757575). X-label below: "Baseline" with a small gray beaker/flask icon, subtitle "(Initial Code)". - Point 1 (Iter 1): 81.2% — large circle marker, colored **emerald green** (#2E7D32). X-label: "Iter 1" with a small green brain/neural-network icon, subtitle "(Deep Encoder + Meta-SGD)". - Point 2 (Iter 2): 77.5% — large circle marker, colored **crimson red** (#C62828). X-label: "Iter 2" with a small red warning-triangle icon, subtitle "(Prototype Net — Regression)". 
- Point 3 (Iter 3): 93.4% — large circle marker, colored **emerald green** (#2E7D32). X-label: "Iter 3" with a small green rocket icon, subtitle "(Linear Clf + L2 Anchor)". - Point 4 (Iter 4): 93.4% — large circle marker, colored **slate gray** (#757575). X-label: "Iter 4" with a small gray checkmark-circle icon, subtitle "(Converged)". **Connecting line:** Thick (3px) solid line in **royal blue** (#1565C0) connecting all 5 points in order. The area below the line (above the baseline value of 25.9%) is filled with a very light semi-transparent blue wash (#1565C0 at 8% opacity). **Annotations with callout arrows:** - Near Point 1: A green callout box with text "+55.3 pts" in bold green, below it "Deep encoder + context-gated replay" in smaller green text. A thin green arrow points from the callout to Point 1. Include a small upward-arrow icon. - Near Point 2: A red italic callout "Prototype net too simple" with a thin red arrow pointing to Point 2. Include a small X-mark icon. - Near Point 3: A green callout box with text "+15.9 pts" in bold green, below it "Linear clf + L2 anchor + cosine gating" in smaller green text. A thin green arrow points from the callout to Point 3. Include a small upward-arrow icon. **Reference line:** A horizontal **dashed orange line** (#E65100) at y=100% with a small label "Oracle (100%)" at the right end in italic orange text. Include a small trophy/target icon next to the label. **Summary stats box:** Upper-left corner, a rounded rectangle with light blue background (#E3F2FD) and blue border (#1565C0), containing monospace text: ``` Baseline: 25.9% → Best: 93.4% Improvement: +67.5 pts (261% rel.) ``` **Legend** at the bottom center with three items, each with a colored square swatch: - Green square: "Improved" - Red square: "Regressed (auto-recovered)" - Gray square: "No change / Baseline" **Style:** Clean, professional, tech-forward aesthetic. Use a modern sans-serif font (like Inter, SF Pro, or Helvetica Neue). 
Subtle drop shadows on the summary box and annotation callouts. Smooth anti-aliased lines. The overall feel should be suitable for a top-tier AI research company's product page or investor deck — polished, data-rich, and visually compelling. High contrast text. No 3D effects. Flat design with depth through subtle shadows and layering. **Dimensions:** 1200 x 900 pixels, 2x retina resolution. ================================================ FILE: docs/figure_prompts/case_b_rlhf_alignment.md ================================================ # Case B: RLHF with Curriculum Reward Shaping — Image Generation Prompt ## Prompt A premium, modern data visualization infographic on a clean white background with subtle light-gray grid lines. The chart is a **line chart with square markers** showing progressive performance improvement across 5 data points on the X-axis (labeled "Self-Iteration Round"). **Overall title** at the top in bold dark navy sans-serif font: "Case B: RLHF with Curriculum-Based Reward Shaping for LLM Alignment" **Y-axis:** "LLM Alignment Score (%)" ranging from 15% to 105%. **X-axis:** "Self-Iteration Round" with 5 labeled tick marks. **Data points and line:** - Point 0 (Baseline): 35.6% — large square marker, colored **slate gray** (#757575). X-label below: "Baseline" with a small gray play-button icon, subtitle "(Vanilla PPO)". - Point 1 (Iter 1): 35.6% — large square marker, colored **slate gray** (#757575). X-label: "Iter 1" with a small gray pause icon, subtitle "(No Change)". - Point 2 (Iter 2): 61.6% — large square marker, colored **emerald green** (#2E7D32). X-label: "Iter 2" with a small green sparkle/star icon, subtitle "(+Reward Model +Curriculum)". - Point 3 (Iter 3): 63.0% — large square marker, colored **emerald green** (#2E7D32). X-label: "Iter 3" with a small green chart-trending-up icon, subtitle "(+Rank-Norm +Policy EMA)". - Point 4 (Iter 4): 66.6% — large square marker, colored **emerald green** (#2E7D32). 
X-label: "Iter 4" with a small green shield-check icon, subtitle "(+Confidence Gating)". **Connecting line:** Thick (3px) solid line in **deep purple** (#6A1B9A) connecting all 5 points in order. The area below the line (above the baseline value of 35.6%) is filled with a very light semi-transparent purple wash (#6A1B9A at 8% opacity). **Annotations with callout arrows:** - Near Point 1: A gray italic callout "No improvement (minor code fix)" with a thin gray arrow pointing down to Point 1. Include a small minus-circle icon. - Near Point 2: A green callout box with text "+26.0 pts" in bold green, below it "+Learned reward model" and "+Curriculum scheduling" in smaller green text. A thin green arrow points from the callout to Point 2. Include a small upward-arrow icon and a tiny brain icon. - Near Point 3: A smaller green callout with text "+1.4 pts" in green, below it "+Rank-norm +Policy EMA" in smaller text. A thin green arrow points to Point 3. Include a small upward-arrow icon. - Near Point 4: A green callout box with text "+3.6 pts" in bold green, below it "+Confidence gating" and "+Mini-batch RM" in smaller green text. A thin green arrow points to Point 4. Include a small upward-arrow icon and a tiny lock/shield icon. **Summary stats box:** Upper-left corner, a rounded rectangle with light purple background (#F3E5F5) and purple border (#6A1B9A), containing monospace text: ``` Baseline: 35.6% → Best: 66.6% Improvement: +31.0 pts (87% rel.) ``` **Legend** at the bottom center with three items, each with a colored square swatch: - Green square: "Improved" - Red square: "Regressed (auto-recovered)" - Gray square: "No change / Baseline" **Style:** Clean, professional, tech-forward aesthetic. Use a modern sans-serif font (like Inter, SF Pro, or Helvetica Neue). Subtle drop shadows on the summary box and annotation callouts. Smooth anti-aliased lines. 
The overall feel should be suitable for a top-tier AI research company's product page or investor deck — polished, data-rich, and visually compelling. High contrast text. No 3D effects. Flat design with depth through subtle shadows and layering. **Dimensions:** 1200 x 900 pixels, 2x retina resolution. ================================================ FILE: docs/integration-guide.md ================================================ # AutoResearchClaw Integration Guide > **The simplest way to use AutoResearchClaw**: give the repo URL to [OpenClaw](https://github.com/openclaw/openclaw) and say *"Research [your topic]."* That's it — OpenClaw handles cloning, installing, configuring, and running the entire 23-stage pipeline for you. This guide is for humans who want to understand what's happening under the hood, or who prefer to set things up manually. --- ## Table of Contents 1. [The Easy Way: OpenClaw](#1-the-easy-way-openclaw) 2. [Manual Setup](#2-manual-setup) 3. [Configuration Walkthrough](#3-configuration-walkthrough) 4. [Running the Pipeline](#4-running-the-pipeline) 5. [Understanding the 23 Stages](#5-understanding-the-23-stages) 6. [Output Artifacts](#6-output-artifacts) 7. [Experiment Modes](#7-experiment-modes) 8. [Conference Templates](#8-conference-templates) 9. [OpenClaw Bridge (Advanced)](#9-openclaw-bridge-advanced) 10. [MetaClaw Integration (Cross-Run Learning)](#10-metaclaw-integration-cross-run-learning) 11. [Other AI Platforms](#11-other-ai-platforms) 12. [Python API](#12-python-api) 13. [Troubleshooting](#13-troubleshooting) 14. [FAQ](#14-faq) --- ## 1. The Easy Way: OpenClaw If you use [OpenClaw](https://github.com/openclaw/openclaw) as your AI assistant, you don't need to read the rest of this guide. ### Steps 1. Share the GitHub repo URL with OpenClaw: ``` https://github.com/aiming-lab/AutoResearchClaw ``` 2. OpenClaw reads `RESEARCHCLAW_AGENTS.md` and `README.md` — it now understands the entire system. 
> **Note:** `RESEARCHCLAW_AGENTS.md` is generated locally and listed in `.gitignore`. If it doesn't exist, OpenClaw can bootstrap from `README.md` and the project structure. 3. Say something like: ``` Research the application of graph neural networks in drug discovery ``` 4. OpenClaw will: - Clone the repo - Create a virtual environment and install dependencies (`pip install -e .`) - Copy `config.researchclaw.example.yaml` → `config.yaml` - Ask you for an OpenAI API key (or use your environment variable) - Run the full 23-stage pipeline - Return the paper, experiment code, charts, and citations **That's the whole process.** OpenClaw is designed to read agent definition files and bootstrap itself. AutoResearchClaw ships with these files specifically so that any OpenClaw-compatible AI assistant can pick it up and run. ### What if I want to tweak settings? Tell OpenClaw in natural language: - *"Use GPT-5.2 instead of GPT-4o"* - *"Run experiments in sandbox mode, not simulated"* - *"Target ICLR 2025 format instead of NeurIPS"* - *"Skip the quality gate, just auto-approve everything"* OpenClaw will modify `config.yaml` accordingly before running the pipeline. --- ## 2. Manual Setup ### Prerequisites | Requirement | Details | |-------------|---------| | **Python** | 3.11 or newer | | **LLM API** | Any OpenAI-compatible endpoint (OpenAI, Azure, local proxy, etc.) | | **Disk space** | ~100 MB for the repo + artifacts per run | | **Network** | Required for LLM API calls and literature search (Semantic Scholar, arXiv) | ### Installation ```bash # Clone the repository git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # Create a virtual environment (recommended) python3 -m venv .venv source .venv/bin/activate # macOS/Linux # .venv\Scripts\activate # Windows # Install pip install -e . 
``` ### Verify Installation ```bash # Check the CLI is available researchclaw --help # Validate your configuration researchclaw validate --config config.yaml ``` --- ## 3. Configuration Walkthrough Start from the provided template: ```bash cp config.researchclaw.example.yaml config.yaml ``` Open `config.yaml` in your editor. Here's what each section does: ### LLM Settings (Required) This is the only section you **must** configure. Everything else has sensible defaults. ```yaml llm: base_url: "https://api.openai.com/v1" # Your LLM API endpoint api_key_env: "OPENAI_API_KEY" # Environment variable name... api_key: "" # ...or paste the key directly here primary_model: "gpt-4o" # Model to use (gpt-4o, gpt-5.2, etc.) fallback_models: # Tried in order if primary fails - "gpt-4.1" - "gpt-4o-mini" s2_api_key: "" # Optional: Semantic Scholar API key for higher rate limits ``` **Using an environment variable** (recommended for security): ```bash export OPENAI_API_KEY="sk-..." ``` **Using a direct key** (simpler, less secure): ```yaml llm: api_key: "sk-your-key-here" ``` **Using a proxy or alternative provider**: ```yaml llm: base_url: "https://your-proxy.example.com/v1" api_key: "your-proxy-key" primary_model: "gpt-4o" # Must be supported by your endpoint ``` ### Research Settings ```yaml research: topic: "Your research topic here" # Can also be set via CLI --topic flag domains: - "machine-learning" # Guides literature search scope daily_paper_count: 10 # Target papers to collect quality_threshold: 4.0 # Minimum paper quality score (1-5) ``` ### Experiment Settings ```yaml experiment: mode: "sandbox" # How experiments run (see Section 7) time_budget_sec: 300 # Max seconds per experiment run max_iterations: 10 # Max refinement loops in Stage 13 metric_key: "primary_metric" # What metric to optimize metric_direction: "minimize" # "minimize" or "maximize" sandbox: python_path: ".venv/bin/python3" # Python binary for sandbox execution gpu_required: false max_memory_mb: 4096 
code_agent: # CodeAgent v2 (multi-phase code generation) enabled: true # Architecture planning + sequential file gen + hard validation benchmark_agent: # Automated dataset & baseline selection enabled: true # 4-agent pipeline: Surveyor→Selector→Acquirer→Validator figure_agent: # Academic figure generation enabled: true # 5-agent pipeline: Planner→CodeGen→Renderer→Critic→Integrator repair: # Anti-fabrication experiment repair enabled: true # Diagnose and fix failed experiments before paper writing max_cycles: 3 # Repair retry loops opencode: # OpenCode Beast Mode (see README for details) enabled: true ``` ### Export Settings ```yaml export: target_conference: "neurips_2025" # See Section 8 for all available templates authors: "Anonymous" # Author line in the paper bib_file: "references" # BibTeX file name (without .bib) ``` ### Everything Else (Optional) These have reasonable defaults. Change them only if you need to: ```yaml project: name: "my-research" # Just an identifier for your run mode: "full-auto" # "docs-first", "semi-auto", or "full-auto" runtime: timezone: "America/New_York" max_parallel_tasks: 3 approval_timeout_hours: 12 retry_limit: 2 security: hitl_required_stages: [5, 9, 20] # Stages that pause for human approval allow_publish_without_approval: false notifications: channel: "console" # "console", "discord", or "slack" knowledge_base: backend: "markdown" root: "docs/kb" ``` --- ## 4. 
Running the Pipeline ### Basic Run ```bash # Run with topic from config.yaml researchclaw run --config config.yaml --auto-approve # Override topic from command line researchclaw run --config config.yaml --topic "Transformer attention for time series" --auto-approve ``` ### CLI Commands | Command | What It Does | |---------|-------------| | `researchclaw setup` | Interactive first-time setup (installs OpenCode Beast Mode, checks Docker/LaTeX) | | `researchclaw init` | Interactive config creation (choose LLM provider, creates `config.arc.yaml`) | | `researchclaw run` | Run the full 23-stage pipeline | | `researchclaw validate` | Check your config file for errors | | `researchclaw doctor` | Diagnose environment issues (Python, dependencies, API connectivity) | | `researchclaw report --run-dir <path>` | Generate a summary report from a completed run directory | --- From a one-line idea to a conference-ready paper — fully autonomous, zero human intervention.
|
**💡** **Idea** |
➜ |
**📚** **Literature** 300–470 papers |
➜ |
**🧪** **Hypothesis** experiment design |
➜ |
**💻** **Code** 2K–15K lines |
➜ |
**🔬** **Execute** sandbox + refine |
➜ |
**📝** **Write** review & audit |
➜ |
**📄** **Paper** NeurIPS PDF |
Each run traverses 23 autonomous stages with iterative self-healing, multi-agent peer review, and citation verification — no human in the loop.
--- Generated on Machine A · 4 papers across 4 non-ML domains
--- ### 📄 Paper I · Random Matrix Theory
Finite-dimensional correction pipeline: Wishart matrix generation → empirical spectral density estimation → MP baseline comparison → bulk/edge error decomposition → correction model fitting. Entirely auto-generated by the FigureAgent subsystem.
Monte Carlo IV evaluation pipeline: DGP specification → estimator suite (2SLS, LIML, Fuller-k, JIVE) → finite-sample risk surfaces → phase diagram construction. Entirely auto-generated by the FigureAgent subsystem.
PRIM benchmark workflow: synthetic outbreak generation (SIR/SEIR) → parameter estimation → profile likelihood vs. FIM diagnostics → identifiability regime mapping. Entirely auto-generated by the FigureAgent subsystem.
Feature-conditioned preconditioner evaluation: sparse matrix collection → structural descriptor extraction → solver–preconditioner grid (CG/GMRES/BiCGSTAB × ILU/Jacobi/SSOR/AMG) → setup-vs-solve tradeoff analysis → decision map. Entirely auto-generated by the FigureAgent subsystem.
Generated on Machine B · NVIDIA RTX 6000 Ada (48 GB) · 4 papers across 4 ML sub-fields
--- ### 📄 Paper V · Parameter-Efficient Fine-Tuning
Gradient spectral analysis → layer-wise rank scoring → dynamic rank allocation under budget constraint. Entirely auto-generated by the FigureAgent subsystem.
Learned state abstraction module integrated with count-based exploration in the DQN agent loop. Entirely auto-generated by the FigureAgent subsystem.
Frequency-aware token merging applied progressively across ViT layers with DCT-based spectral filtering. Entirely auto-generated by the FigureAgent subsystem.
Reliability-aware contrastive feature alignment between teacher and student across clean and corrupted views, with de-alignment on fragile teacher directions. Entirely auto-generated by the FigureAgent subsystem.
| 📋 Metric | I | II | III | IV | V | VI | VII | VIII | 🏆 Total |
|---|---|---|---|---|---|---|---|---|---|
| 🏷️ Domain | Math | Stats | Bio | NumLA | NLP | RL | CV | KD | 8 fields |
| 💻 Code (LOC) | 10,290 | 10,062 | 9,374 | 14,557 | 2,894 | 2,067 | 2,873 | 2,231 | 54,348 |
| ⏱️ Pipeline Time | 2h25m | 2h56m | 2h23m | 2h30m | 50m | 6h48m | 3h18m | 5h48m | ~27 hrs |
| 🔗 References | 26 | 41 | 29 | 33 | 60 | 25 | 40 | 37 | 291 cited |
| 📊 Figures | 5 | 6 | 6 | 4 | 7 | 6 | 7 | 9 | 50 figs |
| 📑 Pages | 16 | 14 | 18 | 16 | 17 | 11 | 10 | 19 | 121 pages |
Every paper above was generated by a single command:
```bash researchclaw run --topic "Your research idea here" --auto-approve ``` ================================================ FILE: prompts.default.yaml ================================================ # ============================================================================= # AutoResearchClaw — Default Prompt Templates # ============================================================================= # # Copy this file, edit any prompt you want to customize, and point your config # to the copy: # # prompts: # custom_file: "my_prompts.yaml" # # Template variables use {var_name} syntax — see docs/integration-guide.md # for a list of available variables per stage. # # Stages without an entry here (experiment_run, citation_verify) do not call # the LLM and therefore have no prompts to customize. # ============================================================================= blocks: compute_budget: | ## Compute Budget Constraint - Total execution time limit: {time_budget_sec} seconds - You MUST design experiments that complete within this budget - Estimate: a simple numpy loop runs ~10M iterations/sec; a nested loop over conditions runs proportionally slower - SCALING RULES (mandatory): - If total conditions > 100: reduce seeds to 3-5 (not 20) - If total conditions > 500: reduce to 2-3 representative conditions per factor - If time_budget < 300s: limit total optimization steps to ≤5,000 per run - If time_budget < 120s: limit total optimization steps to ≤1,000 per run - Always print intermediate results so partial data is captured on timeout - MANDATORY: print a "TIME_ESTIMATE: Xs" line before the main loop, estimating total runtime based on a small pilot (run 1 condition, extrapolate) - MANDATORY: implement a time guard — check elapsed time periodically and stop gracefully if approaching 80% of budget, saving all results collected so far pkg_hint_sandbox: ' AVAILABLE PACKAGES (sandbox mode): Python stdlib, numpy, math, random, statistics, json. 
Do NOT use: torch, tensorflow, jax, sklearn, pandas, scipy, matplotlib, or any deep learning framework. Write the experiment using ONLY numpy and stdlib. ' topic_constraint: ' === HARD TOPIC CONSTRAINT === The paper MUST be about: {topic} PROHIBITED content (unless user explicitly specifies case-study mode): - Do NOT treat environment setup, dependency installation, or infrastructure failures as a research contribution. - Do NOT present debugging logs, system errors, or configuration issues as experimental findings. - Do NOT drift to tangential topics not directly related to the stated topic. - Every section MUST connect back to the core research question. - The Abstract and Introduction MUST clearly state the research problem derived from: {topic} - The Method section MUST describe a technical approach, not a workflow. - The Results section MUST report quantitative outcomes of experiments, not environment status. === END CONSTRAINT === ' stages: code_generation: max_tokens: 8192 system: You are a computational scientist who writes real, runnable experiments. Your code implements actual algorithms with real mathematical operations. You NEVER fake results with random number generators. Always use the ```filename:xxx.py format for each file. Use numpy for numerical computation. Keep code self-contained and deterministic. user: "Generate a Python experiment project for the following research topic:\nTOPIC: {topic}\n\nCRITICAL REQUIREMENTS\ \ — your code MUST satisfy ALL of these:\n1. Implement REAL algorithms (e.g., gradient descent, Adam, SGD, etc.)\n \ \ using numpy arrays — NOT random.uniform() loops that fake results.\n2. Define REAL objective/loss functions (e.g.,\ \ Rosenbrock, quadratic,\n cross-entropy on synthetic data) with proper mathematical formulas.\n3. Run REAL optimization\ \ loops that compute gradients and update parameters.\n4. Collect REAL metrics (loss values, convergence rates) from\ \ the optimization.\n5. 
The code must be scientifically meaningful — a reviewer should see\n actual algorithm implementations,\ \ not random number generators.\n\nOUTPUT FORMAT — return multiple files using this exact format:\n```filename:main.py\n\ # entry point code\n```\n\n```filename:optimizers.py\n# optimizer implementations\n```\n\nCODE STRUCTURE:\n- main.py:\ \ entry point that runs experiments and prints metrics\n- Additional modules for algorithms, objective functions, utilities\n\ - Primary metric key: {metric}\n- main.py must print metric lines as `name: value` (one per line)\n- main.py must ALSO\ \ write a `results.json` file with structured experiment results\n (e.g. per-algorithm, per-function, per-dimension metrics\ \ as nested dicts/lists)\n- Use deterministic seeds (numpy.random.seed or random.seed)\n- No external data files, no\ \ network calls, no GPU required\n- FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket\n- MUST implement convergence\ \ stopping criteria (e.g. stop when objective change < 1e-8 for\n N consecutive iterations) — do NOT just run a fixed\ \ number of iterations\n{pkg_hint}\nANTI-PATTERNS (do NOT do these):\n- Do NOT generate random numbers and pretend they\ \ are experiment results\n- Do NOT use `random.uniform()` to simulate a decreasing loss curve\n- Do NOT hardcode metric\ \ values or use trivial arithmetic as metrics\n- Do NOT run a fixed number of iterations without any convergence check\n- Do NOT implement convergence_rate or similar metrics as dummy return values\n (e.g. 
returning 1.0 or a constant) — measure actual iterations to convergence\n- If you report convergence_rate, define it as iterations_to_convergence / max_iterations\n or similar — it MUST differ between algorithms\n\nNUMPY 2.x COMPATIBILITY (CRITICAL):\n- np.trapz is REMOVED → use np.trapezoid\n- np.erfinv does NOT exist → use scipy.special.erfinv\n- np.bool, np.int, np.float, np.complex are REMOVED → use Python builtins\n- np.str, np.object are REMOVED → use str, object\n- np.math is REMOVED → use math module\n\nExperiment plan:\n{exp_plan}" experiment_design: system: You are a principal investigator designing ML experiments. user: '{preamble} Design an experiment plan as YAML. Required keys: objectives,datasets,baselines,proposed_methods,ablations,metrics,risks,compute_budget. Hypotheses: {hypotheses}' export_publish: max_tokens: 16384 system: You are a publication formatting editor. user: 'Format revised paper into clean final markdown for publication export. Preserve content quality and readability. Input paper: {revised}' hypothesis_gen: system: You formulate testable scientific hypotheses. user: 'Generate at least 2 falsifiable hypotheses from synthesis. Output markdown and for each hypothesis provide rationale, measurable prediction, failure condition. Synthesis: {synthesis}' knowledge_archive: system: You produce reproducibility-focused research retrospectives. user: '{preamble} Write retrospective archive markdown with lessons, reproducibility notes, and future work. Decision: {decision} Analysis: {analysis} Revised paper: {revised}' knowledge_extract: json_mode: true system: You extract high-signal evidence cards from papers. user: 'Extract structured knowledge cards from shortlist. Return JSON: {cards:[{card_id,title,cite_key,problem,method,data,metrics,findings,limitations,citation}]}. IMPORTANT: If the input contains cite_key fields, preserve them exactly in the output. 
Shortlist: {shortlist}' literature_collect: json_mode: true system: You are a literature mining assistant. user: 'Generate candidate papers from the search plan. Return JSON: {candidates:[...]} with >=20 rows. Each candidate must include id,title,source,url,year,abstract,collected_at. Topic: {topic} Search plan: {plan_text}' literature_screen: json_mode: true system: You are a strict domain-aware reviewer. Reject off-topic papers aggressively. user: 'Perform merged relevance+quality screening and return shortlist. Return JSON: {shortlist:[...]} each with title, cite_key (if present), relevance_score (0-1), quality_score (0-1), keep_reason. Preserve all original fields (paper_id, doi, arxiv_id, cite_key, etc.) from the input. Topic: {topic} Domains: {domains} Threshold: {quality_threshold} IMPORTANT: Only keep papers genuinely relevant to the topic above. Reject papers about unrelated domains even if they are high quality. Candidates JSONL: {candidates_text}' paper_draft: max_tokens: 32768 system: "You are a top-tier ML paper author writing for NeurIPS/ICML/ICLR.\n\n\ KEY PRINCIPLES (from accepted paper analyses):\n\ 1. NOVELTY: A good paper has 1-2 key ideas and keeps the rest simple. Think sushi, not curry.\n\ 2. NARRATIVE: The paper is a short, rigorous, evidence-based technical story with a takeaway readers care about.\n\ 3. FIGURE 1: The most important figure. It should convey whatever is most important — many readers go straight to Figure 1.\n\ 4. STRONG BASELINES: Invest real effort in making baselines competitive. Reviewers catch weak baselines.\n\ 5. ABLATIONS: Remove one component at a time and measure the effect. Without ablations, reviewers cannot tell which parts matter.\n\ 6. HONESTY: Acknowledge limitations explicitly. Papers that don't are substantially weaker.\n\ 7. CONTRIBUTIONS: State contributions clearly in Abstract AND Introduction. Many reviewers stop reading carefully after the intro.\n\ 8. 
REPRODUCIBILITY: Include all details needed to reproduce: hyperparameters, data processing, random seeds, hardware specs.\n\n\ COMMON REJECTION REASONS (avoid these):\n\ - Overclaiming: match claims to evidence\n\ - Missing ablations: systematically demonstrate each component's contribution\n\ - Weak baselines: tune baselines with the same effort as your method\n\ - Poor reproducibility: include every detail needed to replicate\n\n\ You ONLY use real experimental data — never fabricate or approximate numbers. Every metric value must exactly match the provided experiment output.\n\ You write at the depth and length expected for a 9-page conference paper (approximately 5000-6500 words in the main body, excluding references)." user: '{preamble} Write a FULL-LENGTH paper draft section by section in markdown. This paper must be suitable for submission to a top-tier ML conference (NeurIPS, ICML, ICLR). CRITICAL LENGTH REQUIREMENTS — each section MUST meet its minimum word count: 1. **Title**: Concise, informative (10-15 words) 2. **Abstract** (150-250 words): Problem, method, key results with numbers, conclusion 3. **Introduction** (800-1000 words): Motivation with real-world context, problem statement, research gap analysis, brief method overview, contribution list (3-4 bullet points), paper organization 4. **Related Work** (600-800 words): Organized by 3-4 thematic groups, each with 4-5 citations. Compare and contrast approaches, identify limitations of prior work, position this work clearly 5. **Method** (1000-1500 words): Formal problem definition with mathematical notation, detailed algorithm description with equations, complexity analysis, design rationale for key choices 6. **Experiments** (800-1200 words): Detailed experimental setup (datasets, preprocessing, data splits), baselines and their implementations, hyperparameter settings (in a table), evaluation metrics with justification, hardware and runtime information 7. 
**Results** (600-800 words): Main results table(s) with ALL metrics, per-condition analysis, statistical significance discussion, ablation studies, qualitative analysis where relevant 8. **Discussion** (400-600 words): Interpretation of key findings, unexpected results analysis, comparison with prior work, practical implications 9. **Limitations** (200-300 words): Honest assessment of scope, dataset, methodology, and generalizability limitations 10. **Conclusion** (200-300 words): Summary of contributions, main findings, and concrete future work directions TOTAL TARGET: 5000-6500 words in the main body. If any section is shorter than its minimum, EXPAND it with substantive technical content — NOT filler. QUALITY STANDARDS: - Use formal academic language throughout - Include mathematical notation where appropriate (use LaTeX-style $...$ for inline math) - Every claim must be supported by either a citation or experimental evidence - Results tables should use markdown table format with proper column headers - Provide algorithm pseudocode in the Method section when applicable Required sections: Title, Abstract, Introduction, Related Work, Method, Experiments, Results, Discussion, Limitations, Conclusion. Do NOT include a References section — it will be auto-generated. {topic_constraint}{exp_metrics_instruction}{citation_instruction}Outline: {outline}' paper_outline: max_tokens: 8192 system: You are an academic writing planner. user: '{preamble} Create a detailed paper outline in markdown. Include per-section goals and evidence links. {topic_constraint}{feedback}Analysis: {analysis} Decision: {decision}' paper_revision: max_tokens: 32768 system: You are a paper revision expert for NeurIPS/ICML/ICLR submissions. When revising, NEVER shorten existing sections — only expand, improve, and add content. The final paper must be at least as long as the draft. user: 'Revise the paper draft to address all review comments. CRITICAL: Maintain or INCREASE the paper length. 
Each section must meet its minimum word count: Abstract (150-250), Introduction (800-1000), Related Work (600-800), Method (1000-1500), Experiments (800-1200), Results (600-800), Discussion (400-600), Limitations (200-300), Conclusion (200-300). Return revised markdown only. {topic_constraint}Draft: {draft} Reviews: {reviews}' peer_review: max_tokens: 8192 system: You are a balanced conference reviewer who is rigorous about methodology-evidence consistency. user: 'Simulate peer review from at least 2 reviewer perspectives. Output markdown with Reviewer A and Reviewer B, each including strengths, weaknesses, and actionable revisions. Check specifically: 1. Does the paper stay on topic ({topic})? Flag any sections where the paper drifts to unrelated topics or presents environment issues as contributions. 2. METHODOLOGY-EVIDENCE CONSISTENCY: Compare the paper''s claims about experimental setup (number of trials, statistical tests, hyperparameters, baselines) against the actual experiment evidence provided below. Flag any discrepancies where the paper claims something that is NOT supported by the actual code or results. For example: - Paper claims N trials but code shows a different number - Paper claims statistical tests (ANOVA, t-test) but code has none - Paper reports metrics not present in actual results - Paper describes methods not implemented in code 3. TRIAL COUNT: The actual number of experiment runs is stated in the evidence below. If the paper claims a DIFFERENT number of trials (e.g., "100 independent trials" when only 1 was run), flag this as a CRITICAL fabrication that MUST be corrected. 4. PAPER LENGTH: This paper targets NeurIPS/ICML submission (9 pages). Check that each section has adequate depth. Flag sections that are too short: Abstract (<150 words), Introduction (<700 words), Related Work (<500 words), Method (<800 words), Experiments (<600 words), Results (<500 words). A paper with fewer than 4000 total words is CRITICALLY under-length. 5. 
REVIEW LIKE A TOP-CONFERENCE REVIEWER: - Is the contribution novel, or is it incremental over well-known work? - Are baselines properly tuned and competitive? - Are ablation studies present and meaningful? - Is every claim supported by evidence from the experiments? - Does the paper acknowledge its limitations honestly? - Would you recommend this paper be presented at NeurIPS/ICML? Why or why not? - Score the paper 1-10 following this rubric: 1-3 Reject (fundamental flaws), 4-5 Borderline (significant weaknesses), 6-7 Weak Accept (solid but not exciting), 8-9 Accept (strong contribution), 10 Strong Accept (exceptional). Paper draft: {draft} {experiment_evidence}' problem_decompose: system: You are a senior research strategist. user: 'Decompose this research problem into at least 4 prioritized sub-questions. Topic: {topic} Output markdown with sections: Source, Sub-questions, Priority Ranking, Risks. Goal context: {goal_text}' quality_gate: json_mode: true system: You are a final quality gate evaluator. user: 'Evaluate revised paper quality and return JSON. Schema: {score_1_to_10:number, verdict:string, strengths:[...], weaknesses:[...], required_actions:[...]}. Threshold: {quality_threshold} Paper: {revised}' research_decision: system: You are a research program lead making go/no-go decisions. user: 'Make a PROCEED or PIVOT decision from analysis. Output markdown with: Decision, Justification, Evidence, Next Actions. Analysis: {analysis}' resource_planning: json_mode: true system: You are an experiment scheduler. user: 'Create schedule JSON with GPU/time estimates. Schema: {tasks:[{id,name,depends_on,gpu_count,estimated_minutes,priority}], total_gpu_budget, generated}. Experiment plan: {exp_plan}' result_analysis: system: You are a quantitative ML analyst. Always cite exact numbers from the provided data. user: '{preamble} {data_context} Analyze run metrics and produce markdown report with statistical interpretation. 
Use the ACTUAL quantitative values provided above — do NOT invent numbers. Required sections: Metrics Summary (with real values), Comparative Findings, Statistical Checks, Limitations, Conclusion. Run context: {context}' search_strategy: json_mode: true system: You design literature retrieval strategies and source verification plans. You aim for COMPREHENSIVE coverage — a good research paper needs 30-60 references. user: 'Create a merged search strategy package. Return a JSON object with keys: search_plan_yaml, sources. search_plan_yaml must be valid YAML text with search_strategies containing at least 3 strategies, each with 3-5 diverse keyword queries (short, 3-6 words each). Generate at least 8 total queries. Cover: core topic, related methods, benchmarks/datasets, theoretical foundations, applications. sources must include id,name,type,url,status,query,verified_at. Topic: {topic} Problem tree: {problem_tree}' synthesis: system: You are a synthesis specialist for literature reviews. user: 'Produce merged synthesis output (topic clusters + research gaps). Output markdown with sections: Cluster Overview, Cluster 1..N, Gap 1..N, Prioritized Opportunities. Topic: {topic} Cards context: {cards_context}' topic_init: system: You are a rigorous research planner. user: 'Create a SMART research goal in markdown. Topic: {topic} Domains: {domains} Project: {project_name} Quality threshold: {quality_threshold} Required sections: Topic, Scope, SMART Goal, Constraints, Success Criteria, Generated.' sub_prompts: code_repair: system: You fix Python code validation errors while preserving functionality. user: 'The file `{fname}` in the experiment project has validation errors. Fix ALL issues and return ONLY the corrected file. ## Validation Issues in {fname} {issues_text} ## All Project Files {all_files_ctx} IMPORTANT: Do NOT use subprocess, os.system, eval, exec, or any network/shell calls. Return ONLY the corrected code for `{fname}`.' 
iterative_improve: max_tokens: 8192 system: You improve experiment projects and return valid executable Python code. Use ```filename:xxx.py format for each file. user: 'Improve the experiment code based on prior run results. Return the improved files using ```filename:xxx.py format for each file. Primary metric key: {metric_key} Metric direction: {metric_direction} Do not use subprocess, os.system, eval, exec, or any network/shell calls. Current project files: {files_context} Run summaries (JSON): {run_summaries}' iterative_repair: system: You fix Python code issues — both static validation errors and runtime bugs (NaN, Inf, division by zero, overflow). Diagnose the ROOT CAUSE from warnings and error messages. Do not add unsafe behavior. user: 'Fix all issues in the experiment code and return corrected Python code using ```filename:xxx.py format for each file. IMPORTANT: If you see NaN/Inf or RuntimeWarning about division or invalid values, trace the bug to its source (e.g. division by zero, uninitialized array, missing convergence check) and fix the actual code logic — do NOT just add try/except to suppress the error. ## Issues Found {issue_text} ## All Project Files {all_files_ctx}' version: '1.0' ================================================ FILE: pyproject.toml ================================================ [project] name = "researchclaw" version = "0.3.1" description = "ResearchClaw — Autonomous Research Pipeline. Turn any research idea into a paper." 
requires-python = ">=3.11"
# Core runtime dependencies; heavier integrations live in the optional extras below.
dependencies = [
    "pyyaml>=6.0",
    "rich>=13.0",
    "arxiv>=2.1",
    "numpy>=1.24",
]
readme = "README.md"
license = {text = "MIT"}

# Optional extras: each enables one integration; "all" is the union plus
# figure/statistics tooling (huggingface-hub, matplotlib, scipy).
[project.optional-dependencies]
anthropic = ["httpx>=0.24"]
web = ["scholarly>=1.7", "crawl4ai>=0.2", "tavily-python>=0.3"]
pdf = ["PyMuPDF>=1.23"]
all = [
    "httpx>=0.24",
    "scholarly>=1.7",
    "crawl4ai>=0.2",
    "tavily-python>=0.3",
    "PyMuPDF>=1.23",
    "huggingface-hub>=0.20",
    "matplotlib>=3.7",
    "scipy>=1.10",
]
dev = ["pytest>=7.0", "httpx>=0.24"]

# Console entry point: `researchclaw` runs researchclaw.cli:main.
[project.scripts]
researchclaw = "researchclaw.cli:main"

[tool.hatch.build.targets.wheel]
packages = ["researchclaw", "sibyl", "arc"]

# Ship the bundled LaTeX/figure style assets inside the wheel.
[tool.hatch.build.targets.wheel.force-include]
"researchclaw/templates/styles" = "researchclaw/templates/styles"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

================================================
FILE: researchclaw/__init__.py
================================================
"""ResearchClaw — Autonomous Research Pipeline."""

# Keep in sync with the version declared in pyproject.toml.
__version__ = "0.3.1"

================================================
FILE: researchclaw/__main__.py
================================================
"""Allow running as `python -m researchclaw`."""

import sys

from researchclaw.cli import main

# Propagate the CLI's return code as the process exit status.
sys.exit(main())

================================================
FILE: researchclaw/adapters.py
================================================
"""Typed adapter interfaces and deterministic recording stubs."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Protocol


@dataclass(frozen=True)
class FetchResponse:
    """Immutable result of a web fetch: URL, HTTP status, and body text."""

    url: str
    status_code: int
    text: str


@dataclass(frozen=True)
class BrowserPage:
    """Immutable snapshot of an opened browser page (URL and title only)."""

    url: str
    title: str


class CronAdapter(Protocol):
    """Structural interface for scheduling a pipeline resume at a later time."""

    def schedule_resume(self, run_id: str, stage_id: int, reason: str) -> str:
        """Schedule a resume of `run_id` at `stage_id`; returns a job identifier."""
        ...


class MessageAdapter(Protocol):
    """Structural interface for sending a notification message."""

    def notify(self, channel: str, subject: str, body: str) -> str:
        """Send a message to `channel`; returns a delivery identifier."""
        ...


class MemoryAdapter(Protocol):
    """Structural interface for appending content to a persistent memory store."""

    def append(self, namespace: str, content: str) -> str:
        """Append `content` under `namespace`; returns an entry identifier."""
        ...
class SessionsAdapter(Protocol):
    """Spawns a named session running a command; returns a session identifier."""

    def spawn(self, name: str, command: tuple[str, ...]) -> str: ...


class WebFetchAdapter(Protocol):
    """Fetches a URL and returns a :class:`FetchResponse`."""

    def fetch(self, url: str) -> FetchResponse: ...


class BrowserAdapter(Protocol):
    """Opens a URL in a browser and returns a :class:`BrowserPage`."""

    def open(self, url: str) -> BrowserPage: ...


@dataclass
class RecordingCronAdapter:
    """Deterministic CronAdapter stub: records calls, returns sequential ids."""

    # Each entry is (run_id, stage_id, reason) in call order.
    calls: list[tuple[str, int, str]] = field(default_factory=list)

    def schedule_resume(self, run_id: str, stage_id: int, reason: str) -> str:
        self.calls.append((run_id, stage_id, reason))
        return f"cron-{len(self.calls)}"


@dataclass
class RecordingMessageAdapter:
    """Deterministic MessageAdapter stub: records calls, returns sequential ids."""

    # Each entry is (channel, subject, body) in call order.
    calls: list[tuple[str, str, str]] = field(default_factory=list)

    def notify(self, channel: str, subject: str, body: str) -> str:
        self.calls.append((channel, subject, body))
        return f"message-{len(self.calls)}"


@dataclass
class RecordingMemoryAdapter:
    """Deterministic MemoryAdapter stub: records entries, returns sequential ids."""

    # Each entry is (namespace, content) in call order.
    entries: list[tuple[str, str]] = field(default_factory=list)

    def append(self, namespace: str, content: str) -> str:
        self.entries.append((namespace, content))
        return f"memory-{len(self.entries)}"


@dataclass
class RecordingSessionsAdapter:
    """Deterministic SessionsAdapter stub: records calls, returns sequential ids."""

    # Each entry is (name, command) in call order.
    calls: list[tuple[str, tuple[str, ...]]] = field(default_factory=list)

    def spawn(self, name: str, command: tuple[str, ...]) -> str:
        self.calls.append((name, command))
        return f"session-{len(self.calls)}"


@dataclass
class RecordingWebFetchAdapter:
    """Deterministic WebFetchAdapter stub: records URLs, returns canned 200 responses."""

    calls: list[str] = field(default_factory=list)

    def fetch(self, url: str) -> FetchResponse:
        self.calls.append(url)
        return FetchResponse(url=url, status_code=200, text=f"stub fetch for {url}")


@dataclass
class RecordingBrowserAdapter:
    """Deterministic BrowserAdapter stub: records URLs, returns canned pages."""

    calls: list[str] = field(default_factory=list)

    def open(self, url: str) -> BrowserPage:
        self.calls.append(url)
        return BrowserPage(url=url, title=f"Stub browser page for {url}")


@dataclass
class MCPMessageAdapter:
    """MessageAdapter backed by an MCP tool call."""

    # NOTE(review): notify() currently returns a synthetic id without
    # contacting server_uri — looks like a placeholder; confirm intended.
    server_uri: str = "http://localhost:3000"

    def notify(self, channel: str, subject: str, body: str) -> str:
        return f"mcp-notify-{channel}"


@dataclass
class MCPWebFetchAdapter:
    """WebFetchAdapter backed by an MCP tool call."""

    # NOTE(review): fetch() currently returns canned text without contacting
    # server_uri — looks like a placeholder; confirm intended.
    server_uri: str = "http://localhost:3000"

    def fetch(self, url: str) -> FetchResponse:
        return FetchResponse(url=url, status_code=200, text=f"mcp fetch for {url}")


@dataclass
class AdapterBundle:
    """Bundle of all adapter slots, defaulting to the recording stubs."""

    cron: CronAdapter = field(default_factory=RecordingCronAdapter)
    message: MessageAdapter = field(default_factory=RecordingMessageAdapter)
    memory: MemoryAdapter = field(default_factory=RecordingMemoryAdapter)
    sessions: SessionsAdapter = field(default_factory=RecordingSessionsAdapter)
    web_fetch: WebFetchAdapter = field(default_factory=RecordingWebFetchAdapter)
    browser: BrowserAdapter = field(default_factory=RecordingBrowserAdapter)

    @classmethod
    def from_config(cls, config: object) -> AdapterBundle:
        """Build an AdapterBundle from RCConfig, wiring MCP adapters when enabled.

        Only the message and web_fetch slots are swapped to MCP-backed
        adapters; all other slots keep their recording defaults.
        """
        bundle = cls()
        # Attribute access is duck-typed so any config object (or None) works.
        mcp_cfg = getattr(config, "mcp", None)
        if mcp_cfg and getattr(mcp_cfg, "server_enabled", False):
            uri = f"http://localhost:{getattr(mcp_cfg, 'server_port', 3000)}"
            bundle.message = MCPMessageAdapter(server_uri=uri)
            bundle.web_fetch = MCPWebFetchAdapter(server_uri=uri)
        return bundle


================================================
FILE: researchclaw/agents/__init__.py
================================================
"""Multi-agent subsystems for AutoResearchClaw pipeline."""


================================================
FILE: researchclaw/agents/base.py
================================================
"""Base classes for multi-agent subsystems.

Provides ``BaseAgent`` (individual agent) and ``AgentOrchestrator``
(coordinator for multi-agent workflows). Both use the existing
``LLMClient`` for model calls and follow the same structural-typing
conventions as ``CodeAgent``.
""" from __future__ import annotations import json import logging import re from dataclasses import dataclass, field from typing import Any, Protocol logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # LLM protocol (structural typing — no import dependency on llm.client) # --------------------------------------------------------------------------- class _LLMResponseLike(Protocol): # pragma: no cover content: str model: str prompt_tokens: int completion_tokens: int class _LLMClientLike(Protocol): # pragma: no cover def chat( self, messages: list[dict[str, str]], *, system: str | None = None, max_tokens: int | None = None, temperature: float | None = None, json_mode: bool = False, ) -> Any: ... # --------------------------------------------------------------------------- # Agent result # --------------------------------------------------------------------------- @dataclass class AgentStepResult: """Output from a single agent step.""" success: bool data: dict[str, Any] = field(default_factory=dict) error: str = "" llm_calls: int = 0 token_usage: int = 0 # --------------------------------------------------------------------------- # Base agent # --------------------------------------------------------------------------- class BaseAgent: """Base class for all sub-agents in a multi-agent system. Subclasses must implement ``execute(context) -> AgentStepResult``. 
""" name: str = "base" def __init__(self, llm: _LLMClientLike) -> None: self._llm = llm self._calls = 0 self._tokens = 0 self.logger = logging.getLogger(f"{__name__}.{self.name}") # -- LLM helpers ------------------------------------------------------- def _chat( self, system: str, user: str, *, max_tokens: int = 4096, temperature: float = 0.4, json_mode: bool = False, ) -> str: """Send a chat message and return the content string.""" self._calls += 1 resp = self._llm.chat( [{"role": "user", "content": user}], system=system, max_tokens=max_tokens, temperature=temperature, json_mode=json_mode, ) self._tokens += getattr(resp, "total_tokens", 0) return resp.content def _chat_json( self, system: str, user: str, *, max_tokens: int = 4096, temperature: float = 0.3, ) -> dict[str, Any]: """Send a chat message expecting JSON output. Falls back to regex extraction.""" raw = self._chat( system, user, max_tokens=max_tokens, temperature=temperature, json_mode=True, ) return self._parse_json(raw) or {} # -- JSON parsing (3-tier, matching CodeAgent convention) --------------- @staticmethod def _parse_json(text: str) -> dict[str, Any] | None: """Try to extract JSON from text using three strategies. Always returns a ``dict`` or ``None`` — lists and other JSON primitives are discarded so callers can safely use ``.get()``. """ def _as_dict(val: Any) -> dict[str, Any] | None: return val if isinstance(val, dict) else None # 1. Direct parse try: return _as_dict(json.loads(text)) except (json.JSONDecodeError, ValueError): pass # 2. Fenced code block m = re.search(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL) if m: try: return _as_dict(json.loads(m.group(1))) except (json.JSONDecodeError, ValueError): pass # 3. First balanced { ... 
} block (BUG-DA6-07: use non-greedy brace matching) depth = 0 start_idx = -1 for i, ch in enumerate(text): if ch == "{": if depth == 0: start_idx = i depth += 1 elif ch == "}": depth -= 1 if depth == 0 and start_idx >= 0: candidate = text[start_idx : i + 1] try: return _as_dict(json.loads(candidate)) except (json.JSONDecodeError, ValueError): start_idx = -1 # try next top-level block return None # -- Subclass API ------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Execute the agent's task. Must be overridden.""" raise NotImplementedError def _make_result( self, success: bool, data: dict[str, Any] | None = None, error: str = "", ) -> AgentStepResult: # BUG-DA6-01: Return per-call delta, then reset counters to avoid # double-counting when the same agent instance is reused across retries. calls, tokens = self._calls, self._tokens self._calls = 0 self._tokens = 0 return AgentStepResult( success=success, data=data or {}, error=error, llm_calls=calls, token_usage=tokens, ) # --------------------------------------------------------------------------- # Orchestrator # --------------------------------------------------------------------------- class AgentOrchestrator: """Coordinates a sequence of agents with optional retry loops. Subclasses implement ``orchestrate(context) -> dict`` which defines the specific workflow (sequential, branching, iterative, etc.). """ def __init__(self, llm: _LLMClientLike, *, max_iterations: int = 3) -> None: self._llm = llm self.max_iterations = max_iterations self.logger = logging.getLogger(f"{__name__}.orchestrator") self.total_llm_calls = 0 self.total_tokens = 0 def _accumulate(self, result: AgentStepResult) -> None: """Track cumulative LLM usage.""" self.total_llm_calls += result.llm_calls self.total_tokens += result.token_usage def orchestrate(self, context: dict[str, Any]) -> dict[str, Any]: """Run the multi-agent workflow. 
Must be overridden.""" raise NotImplementedError ================================================ FILE: researchclaw/agents/benchmark_agent/__init__.py ================================================ """BenchmarkAgent — multi-agent benchmark, dataset, and baseline selection. Architecture ------------ 1. **Surveyor** — searches HuggingFace Hub + local knowledge base for domain-relevant benchmarks, datasets, and baseline methods. 2. **Selector** — filters and ranks candidates based on hardware constraints, time budget, network policy, and tier availability. 3. **Acquirer** — generates data-loading code snippets, ``setup.py`` download scripts, baseline boilerplate, and ``requirements.txt`` entries. 4. **Validator** — validates generated code for syntax correctness and API compatibility. The ``BenchmarkOrchestrator`` coordinates the four agents and produces a ``BenchmarkPlan`` consumed by downstream pipeline stages (experiment design, code generation). """ from researchclaw.agents.benchmark_agent.orchestrator import ( BenchmarkOrchestrator, BenchmarkPlan, ) __all__ = ["BenchmarkOrchestrator", "BenchmarkPlan"] ================================================ FILE: researchclaw/agents/benchmark_agent/acquirer.py ================================================ """Acquirer Agent — generates data loading code and download scripts. Produces three outputs consumed by the code generation stage: 1. Data loading snippets (``get_datasets()`` function) 2. Baseline method snippets (model instantiation code) 3. 
``setup.py`` additions for dataset downloading """ from __future__ import annotations import logging from typing import Any from researchclaw.agents.base import AgentStepResult, BaseAgent logger = logging.getLogger(__name__) class AcquirerAgent(BaseAgent): """Generates data loading, baseline, and download code.""" name = "acquirer" def _generate_data_loader( self, benchmarks: list[dict[str, Any]], topic: str, ) -> str: """Ask LLM to generate a robust data loading function.""" bench_specs = [] for b in benchmarks: spec = ( f"- {b.get('name', 'Unknown')} (tier {b.get('tier', '?')}, " f"role: {b.get('role', 'secondary')})\n" f" API: {b.get('api', 'N/A')}\n" f" Metrics: {b.get('metrics', [])}\n" f" Note: {b.get('note', '')}" ) bench_specs.append(spec) system = ( "You are an expert ML engineer. Generate a Python function that loads " "and prepares datasets for an ML experiment.\n\n" "REQUIREMENTS:\n" "- Function signature: def get_datasets(data_root='/workspace/data') -> dict\n" "- Returns dict with keys: 'train', 'val', 'test' (each a Dataset or DataLoader)\n" "- Include appropriate transforms (normalization, augmentation for training)\n" "- Handle both torchvision and HuggingFace datasets APIs\n" "- Include proper train/val/test splits\n" "- Add error handling with informative messages\n" "- For pre-cached datasets (tier 1), use download=False\n" "- For downloadable datasets (tier 2), use download=True in setup.py\n" "- Include a DATA_CONFIG dict with dataset metadata (num_classes, input_shape, etc.)\n\n" "Return ONLY the Python code, no explanation." ) user = ( f"Research Topic: {topic}\n\n" f"Datasets to load:\n" + "\n".join(bench_specs) + "\n\n" "Generate the data loading code." 
) return self._chat(system, user, max_tokens=4096, temperature=0.2) def _generate_baseline_code( self, baselines: list[dict[str, Any]], benchmarks: list[dict[str, Any]], topic: str, ) -> str: """Ask LLM to generate baseline method instantiation code.""" base_specs = [] for bl in baselines: spec = ( f"- {bl.get('name', 'Unknown')}\n" f" Source: {bl.get('source', 'N/A')}\n" f" Paper: {bl.get('paper', 'N/A')}" ) base_specs.append(spec) primary_bench = next( (b for b in benchmarks if b.get("role") == "primary"), benchmarks[0] if benchmarks else {}, ) system = ( "You are an expert ML engineer. Generate Python code that instantiates " "baseline methods for comparison in an ML experiment.\n\n" "REQUIREMENTS:\n" "- Function signature: def get_baselines(num_classes, device='cuda') -> dict\n" "- Returns dict mapping method_name -> model (nn.Module)\n" "- Each model must be ready for training (correct output dimensions)\n" "- Use pretrained weights where available (for feature extractors)\n" "- Adapt final layer to match num_classes of the target dataset\n" "- Include a BASELINES_CONFIG dict with metadata (param_count, paper, etc.)\n" "- Handle missing optional packages gracefully\n\n" "Return ONLY the Python code, no explanation." ) user = ( f"Research Topic: {topic}\n" f"Primary Dataset: {primary_bench.get('name', 'N/A')} " f"({primary_bench.get('classes', '?')} classes)\n\n" f"Baseline Methods:\n" + "\n".join(base_specs) + "\n\n" "Generate the baseline instantiation code." 
) return self._chat(system, user, max_tokens=4096, temperature=0.2) def _generate_setup_script( self, benchmarks: list[dict[str, Any]], required_pip: list[str], ) -> str: """Generate setup.py content for dataset downloading.""" # Tier 2 datasets need download scripts tier2 = [b for b in benchmarks if b.get("tier", 1) >= 2] if not tier2 and not required_pip: return "" lines = [ '"""Setup script for dataset downloading and environment preparation.', '', 'This script runs during Phase 1 (setup) of the Docker sandbox,', 'when network access is available. It downloads datasets and installs', 'any additional dependencies.', '"""', '', 'import os', 'import sys', '', 'DATA_ROOT = "/workspace/data"', 'HF_CACHE = os.path.join(DATA_ROOT, "hf")', '', '', 'def download_datasets():', ' """Download all required datasets."""', ' os.makedirs(DATA_ROOT, exist_ok=True)', ' os.makedirs(HF_CACHE, exist_ok=True)', '', ] for b in tier2: api = b.get("api", "") name = b.get("name", "unknown") if "torchvision" in api: # Convert download=False to download=True for setup dl_api = api.replace("download=False", "download=True") lines.extend([ f' # Download {name}', ' try:', f' import torchvision', f' {dl_api}', f' print(f"Downloaded {name}")', f' except Exception as e:', f' print(f"Warning: Failed to download {name}: {{e}}")', '', ]) elif "datasets.load_dataset" in api or "load_dataset" in api: # Rewrite qualified `datasets.load_dataset(...)` to # `load_dataset(...)` so it matches the `from datasets import` _dl_api = api.replace("datasets.load_dataset", "load_dataset") lines.extend([ f' # Download {name}', ' try:', f' from datasets import load_dataset', f' {_dl_api}', f' print(f"Downloaded {name}")', f' except Exception as e:', f' print(f"Warning: Failed to download {name}: {{e}}")', '', ]) elif "PygNodePropPredDataset" in api or "PygGraphPropPredDataset" in api: lines.extend([ f' # Download {name}', ' try:', f' from ogb.nodeproppred import PygNodePropPredDataset' if 'Node' in api else f' from 
ogb.graphproppred import PygGraphPropPredDataset', f' {api}', f' print(f"Downloaded {name}")', f' except Exception as e:', f' print(f"Warning: Failed to download {name}: {{e}}")', '', ]) lines.extend([ '', 'if __name__ == "__main__":', ' download_datasets()', ' print("Setup complete.")', ]) return "\n".join(lines) def _generate_requirements(self, required_pip: list[str]) -> str: """Generate requirements.txt content for additional packages.""" if not required_pip: return "" # Filter out packages that are already in the Docker image builtin = { "torch", "torchvision", "torchaudio", "numpy", "scipy", "sklearn", "scikit-learn", "pandas", "matplotlib", "seaborn", "tqdm", "gymnasium", "networkx", "timm", "einops", "torchmetrics", "transformers", "datasets", "accelerate", "peft", "trl", "bitsandbytes", "tokenizers", "safetensors", "h5py", "tensorboard", "pillow", "pyyaml", "kornia", "albumentations", } extra = [p for p in required_pip if p.lower() not in builtin] return "\n".join(extra) if extra else "" # -- Code cleanup ------------------------------------------------------ @staticmethod def _strip_fences(code: str) -> str: """Remove markdown code fences if present.""" code = code.strip() if code.startswith("```"): # Remove opening fence first_nl = code.index("\n") if "\n" in code else len(code) code = code[first_nl + 1:] if code.endswith("```"): code = code[:-3].rstrip() return code # -- Main entry point -------------------------------------------------- def execute(self, context: dict[str, Any]) -> AgentStepResult: """Generate data loading, baseline, and download code. Context keys: topic (str): Research topic selection (dict): Output from SelectorAgent """ topic = context.get("topic", "") selection = context.get("selection", {}) benchmarks = selection.get("selected_benchmarks", []) baselines = selection.get("selected_baselines", []) required_pip = selection.get("required_pip", []) if not benchmarks: return self._make_result(False, error="No benchmarks selected") # 1. 
Generate data loading code self.logger.info("Generating data loading code for %d datasets", len(benchmarks)) data_loader_code = self._strip_fences( self._generate_data_loader(benchmarks, topic) ) # 2. Generate baseline code baseline_code = "" if baselines: self.logger.info("Generating baseline code for %d methods", len(baselines)) baseline_code = self._strip_fences( self._generate_baseline_code(baselines, benchmarks, topic) ) # 3. Generate setup.py setup_code = self._generate_setup_script(benchmarks, required_pip) # 4. Generate requirements.txt requirements = self._generate_requirements(required_pip) result = { "data_loader_code": data_loader_code, "baseline_code": baseline_code, "setup_code": setup_code, "requirements": requirements, "benchmark_names": [b.get("name", "Unknown") for b in benchmarks], "baseline_names": [bl.get("name", "Unknown") for bl in baselines], } self.logger.info("Acquirer complete: %d code artifacts generated", sum(1 for v in result.values() if v)) return self._make_result(True, data=result) ================================================ FILE: researchclaw/agents/benchmark_agent/orchestrator.py ================================================ """BenchmarkAgent Orchestrator — coordinates the four sub-agents. Flow: Surveyor → Selector → Acquirer → Validator (→ retry if failed) Produces a ``BenchmarkPlan`` consumed by experiment design and code generation stages. 
"""

from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from researchclaw.agents.base import AgentOrchestrator
from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent
from researchclaw.agents.benchmark_agent.selector import SelectorAgent
from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent
from researchclaw.agents.benchmark_agent.validator import ValidatorAgent

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class BenchmarkAgentConfig:
    """Configuration for the BenchmarkAgent system."""

    enabled: bool = True
    # Surveyor
    enable_hf_search: bool = True
    max_hf_results: int = 10
    enable_web_search: bool = False
    max_web_results: int = 5
    web_search_min_local: int = 3
    # Selector
    tier_limit: int = 2
    min_benchmarks: int = 1
    min_baselines: int = 2
    prefer_cached: bool = True
    # Orchestrator
    max_iterations: int = 2  # max Acquirer→Validator retry loops


# ---------------------------------------------------------------------------
# Output data structure
# ---------------------------------------------------------------------------


@dataclass
class BenchmarkPlan:
    """Final output from the BenchmarkAgent system.

    Consumed by:
    - Experiment design stage (selected benchmarks/baselines for plan)
    - Code generation stage (data_loader_code, baseline_code)
    - Docker sandbox (setup_code, requirements)
    """

    # Selected items
    selected_benchmarks: list[dict[str, Any]] = field(default_factory=list)
    selected_baselines: list[dict[str, Any]] = field(default_factory=list)
    matched_domains: list[str] = field(default_factory=list)
    # Generated code
    data_loader_code: str = ""
    baseline_code: str = ""
    setup_code: str = ""
    requirements: str = ""
    # Metadata
    rationale: str = ""
    experiment_notes: str = ""
    validation_passed: bool = False
    validation_warnings: list[str] = field(default_factory=list)
    # Stats
    total_llm_calls: int = 0
    total_tokens: int = 0
    elapsed_sec: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return {
            "selected_benchmarks": self.selected_benchmarks,
            "selected_baselines": self.selected_baselines,
            "matched_domains": self.matched_domains,
            "data_loader_code": self.data_loader_code,
            "baseline_code": self.baseline_code,
            "setup_code": self.setup_code,
            "requirements": self.requirements,
            "rationale": self.rationale,
            "experiment_notes": self.experiment_notes,
            "validation_passed": self.validation_passed,
            "validation_warnings": self.validation_warnings,
            "total_llm_calls": self.total_llm_calls,
            "total_tokens": self.total_tokens,
            "elapsed_sec": self.elapsed_sec,
        }

    def to_prompt_block(self) -> str:
        """Format as a prompt block for injection into code generation.

        Sections are emitted only when the corresponding field is non-empty.
        """
        parts = []
        # Benchmark summary
        if self.selected_benchmarks:
            parts.append("## Selected Benchmarks")
            for b in self.selected_benchmarks:
                role = b.get("role", "secondary")
                metrics = b.get("metrics", [])
                parts.append(
                    f"- **{b.get('name', 'Unknown')}** ({role}) — "
                    f"metrics: {', '.join(str(m) for m in metrics)}"
                )
                if b.get("api"):
                    parts.append(f"  API: `{b['api']}`")
                if b.get("note"):
                    parts.append(f"  Note: {b['note']}")
        # Baseline summary
        if self.selected_baselines:
            parts.append("\n## Selected Baselines")
            for bl in self.selected_baselines:
                parts.append(
                    f"- **{bl.get('name', 'Unknown')}**: {bl.get('paper', 'N/A')}"
                )
                if bl.get("source"):
                    parts.append(f"  Code: `{bl['source']}`")
        # Data loading code
        if self.data_loader_code:
            parts.append("\n## Data Loading Code (READY TO USE)")
            parts.append("```python")
            parts.append(self.data_loader_code)
            parts.append("```")
        # Baseline code
        if self.baseline_code:
            parts.append("\n## Baseline Methods Code (READY TO USE)")
            parts.append("```python")
            parts.append(self.baseline_code)
            parts.append("```")
        # Experiment notes
        if self.experiment_notes:
            parts.append(f"\n## Experiment Notes\n{self.experiment_notes}")
        return "\n".join(parts)


# ---------------------------------------------------------------------------
# Orchestrator
# ---------------------------------------------------------------------------


class BenchmarkOrchestrator(AgentOrchestrator):
    """Coordinates Surveyor → Selector → Acquirer → Validator pipeline."""

    def __init__(
        self,
        llm: Any,
        config: BenchmarkAgentConfig | None = None,
        *,
        gpu_memory_mb: int = 49000,
        time_budget_sec: int = 300,
        network_policy: str = "setup_only",
        stage_dir: Path | None = None,
    ) -> None:
        cfg = config or BenchmarkAgentConfig()
        super().__init__(llm, max_iterations=cfg.max_iterations)
        self._config = cfg
        self._stage_dir = stage_dir
        # Initialize sub-agents
        self._surveyor = SurveyorAgent(
            llm,
            enable_hf_search=cfg.enable_hf_search,
            max_hf_results=cfg.max_hf_results,
        )
        self._selector = SelectorAgent(
            llm,
            gpu_memory_mb=gpu_memory_mb,
            time_budget_sec=time_budget_sec,
            network_policy=network_policy,
            tier_limit=cfg.tier_limit,
            min_benchmarks=cfg.min_benchmarks,
            min_baselines=cfg.min_baselines,
            prefer_cached=cfg.prefer_cached,
        )
        self._acquirer = AcquirerAgent(llm)
        self._validator = ValidatorAgent(llm)

    def _save_artifact(self, name: str, data: Any) -> None:
        """Save intermediate artifact to stage directory.

        No-op when no stage directory is configured. Strings are written
        verbatim; everything else is JSON-serialized.
        """
        if self._stage_dir is None:
            return
        self._stage_dir.mkdir(parents=True, exist_ok=True)
        path = self._stage_dir / name
        if isinstance(data, str):
            path.write_text(data, encoding="utf-8")
        else:
            path.write_text(
                json.dumps(data, indent=2, ensure_ascii=False, default=str),
                encoding="utf-8",
            )

    def orchestrate(self, context: dict[str, Any]) -> BenchmarkPlan:
        """Run the full benchmark selection pipeline.

        Context keys:
            topic (str): Research topic/title
            hypothesis (str): Research hypothesis
            experiment_plan (str): Experiment plan text

        Returns a BenchmarkPlan; on survey/selection failure the plan is
        returned early with only the stats fields populated.
        """
        t0 = time.monotonic()
        topic = context.get("topic", "")
        hypothesis = context.get("hypothesis", "")
        self.logger.info("BenchmarkAgent starting for: %s", topic[:80])
        plan = BenchmarkPlan()

        # ── Phase 1: Survey ───────────────────────────────────────
        self.logger.info("Phase 1: Surveying benchmarks")
        survey_result = self._surveyor.execute({
            "topic": topic,
            "hypothesis": hypothesis,
            "experiment_plan": context.get("experiment_plan", ""),
        })
        self._accumulate(survey_result)
        if not survey_result.success:
            self.logger.warning("Survey failed: %s", survey_result.error)
            plan.elapsed_sec = time.monotonic() - t0
            plan.total_llm_calls = self.total_llm_calls
            plan.total_tokens = self.total_tokens
            return plan
        survey = survey_result.data
        plan.matched_domains = survey.get("matched_domains", [])
        self._save_artifact("survey_results.json", survey)

        # ── Phase 2: Select ───────────────────────────────────────
        self.logger.info("Phase 2: Selecting benchmarks and baselines")
        select_result = self._selector.execute({
            "topic": topic,
            "survey": survey,
        })
        self._accumulate(select_result)
        if not select_result.success:
            self.logger.warning("Selection failed: %s", select_result.error)
            plan.elapsed_sec = time.monotonic() - t0
            plan.total_llm_calls = self.total_llm_calls
            plan.total_tokens = self.total_tokens
            return plan
        selection = select_result.data
        plan.selected_benchmarks = selection.get("selected_benchmarks", [])
        plan.selected_baselines = selection.get("selected_baselines", [])
        plan.rationale = selection.get("rationale", "")
        plan.experiment_notes = selection.get("experiment_notes", "")
        self._save_artifact("selection_results.json", selection)

        # ── Phase 3+4: Acquire + Validate (with retry) ───────────
        for iteration in range(self.max_iterations):
            self.logger.info(
                "Phase 3: Acquiring code (iteration %d/%d)",
                iteration + 1,
                self.max_iterations,
            )
            # Acquire
            acq_result = self._acquirer.execute({
                "topic": topic,
                "selection": selection,
            })
            self._accumulate(acq_result)
            if not acq_result.success:
                self.logger.warning("Acquisition failed: %s", acq_result.error)
                continue
            acquisition = acq_result.data
            # Code fields are excluded from the JSON artifact (saved elsewhere).
            self._save_artifact(
                f"acquisition_{iteration}.json",
                {k: v for k, v in acquisition.items()
                 if k not in ("data_loader_code", "baseline_code", "setup_code")},
            )
            # Validate
            self.logger.info("Phase 4: Validating code (iteration %d/%d)",
                             iteration + 1, self.max_iterations)
            val_result = self._validator.execute({
                "acquisition": acquisition,
            })
            self._accumulate(val_result)
            validation = val_result.data
            self._save_artifact(f"validation_{iteration}.json", validation)
            # Store results (latest iteration wins even if validation fails)
            plan.data_loader_code = acquisition.get("data_loader_code", "")
            plan.baseline_code = acquisition.get("baseline_code", "")
            plan.setup_code = acquisition.get("setup_code", "")
            plan.requirements = acquisition.get("requirements", "")
            plan.validation_passed = validation.get("passed", False)
            plan.validation_warnings = validation.get("warnings", [])
            if plan.validation_passed:
                self.logger.info("Validation passed on iteration %d", iteration + 1)
                break
            self.logger.warning(
                "Validation failed (iteration %d): %s",
                iteration + 1,
                validation.get("errors", []),
            )

        # ── Finalize ──────────────────────────────────────────────
        plan.total_llm_calls = self.total_llm_calls
        plan.total_tokens = self.total_tokens
        plan.elapsed_sec = time.monotonic() - t0
        # Save final plan
        self._save_artifact("benchmark_plan.json", plan.to_dict())
        self.logger.info(
            "BenchmarkAgent complete: %d benchmarks, %d baselines, "
            "validation=%s, %d LLM calls, %.1fs",
            len(plan.selected_benchmarks),
            len(plan.selected_baselines),
            "PASS" if
plan.validation_passed else "FAIL",
            plan.total_llm_calls,
            plan.elapsed_sec,
        )
        return plan


================================================
FILE: researchclaw/agents/benchmark_agent/selector.py
================================================
"""Selector Agent — filters and ranks benchmark candidates.

Applies hardware constraints, time budget, network policy, and tier
priorities to select the optimal combination of datasets and baselines.
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

import yaml

from researchclaw.agents.base import AgentStepResult, BaseAgent

logger = logging.getLogger(__name__)

# Local knowledge base shipped with the package (researchclaw/data/).
_KNOWLEDGE_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "benchmark_knowledge.yaml"

# Maximum dataset size (MB) by tier and network policy
_SIZE_LIMITS: dict[str, int] = {
    "none": 0,           # No download allowed — tier 1 only
    "setup_only": 5000,  # Can download during setup phase
    "pip_only": 0,       # pip only, no data download
    "full": 50000,       # Generous limit
}


class SelectorAgent(BaseAgent):
    """Filters and ranks datasets/baselines based on constraints."""

    name = "selector"

    def __init__(
        self,
        llm: Any,
        *,
        gpu_memory_mb: int = 49000,
        time_budget_sec: int = 300,
        network_policy: str = "setup_only",
        tier_limit: int = 2,
        min_benchmarks: int = 1,
        min_baselines: int = 2,
        prefer_cached: bool = True,
    ) -> None:
        super().__init__(llm)
        self._gpu_mb = gpu_memory_mb
        self._time_sec = time_budget_sec
        self._network_policy = network_policy
        self._tier_limit = tier_limit
        self._min_bench = min_benchmarks
        self._min_base = min_baselines
        self._prefer_cached = prefer_cached

    # -- Filtering ---------------------------------------------------------

    def _filter_benchmarks(
        self,
        benchmarks: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Filter benchmarks by tier, size, and network policy.

        Unknown policies fall back to the 5000 MB setup_only limit.
        """
        max_size = _SIZE_LIMITS.get(self._network_policy, 5000)
        filtered: list[dict[str, Any]] = []
        for b in benchmarks:
            tier = b.get("tier", 3)
            size = b.get("size_mb", 0)
            # Tier filter
            if tier > self._tier_limit:
                continue
            # Network policy filter (tier 2+ requires downloading)
            if tier >= 2 and self._network_policy in ("none", "pip_only"):
                continue
            # Size filter (tier 2+ only — tier 1 is pre-cached)
            if tier >= 2 and size > max_size:
                continue
            filtered.append(b)
        return filtered

    def _filter_baselines(
        self,
        baselines: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Filter baselines by pip availability."""
        filtered: list[dict[str, Any]] = []
        for bl in baselines:
            pip_deps = bl.get("pip", [])
            # If no network, only allow baselines with no extra pip deps
            if self._network_policy == "none" and pip_deps:
                continue
            filtered.append(bl)
        return filtered

    # -- Ranking -----------------------------------------------------------

    def _rank_benchmarks(
        self,
        benchmarks: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Sort benchmarks by preference: tier 1 > tier 2, knowledge_base > hf, downloads."""

        def _score(b: dict[str, Any]) -> tuple[int, int, int]:
            tier = b.get("tier", 3)
            # Prefer lower tier (cached first); with reverse=True below,
            # -tier places tier 1 ahead of tier 2.
            tier_score = -tier if self._prefer_cached else 0
            # Prefer knowledge_base over hf/llm
            origin_score = {
                "knowledge_base": 2,
                "huggingface_hub": 1,
                "llm_suggestion": 0,
            }.get(b.get("origin", ""), 0)
            # Downloads as tiebreaker
            downloads = b.get("downloads", 0)
            return (tier_score, origin_score, downloads)

        return sorted(benchmarks, key=_score, reverse=True)

    def _rank_baselines(
        self,
        baselines: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Sort baselines: knowledge_base first, fewer deps preferred."""

        def _score(bl: dict[str, Any]) -> tuple[int, int]:
            origin_score = 1 if bl.get("origin") == "knowledge_base" else 0
            dep_score = -len(bl.get("pip", []))
            return (origin_score, dep_score)

        return sorted(baselines, key=_score, reverse=True)

    # -- Selection ---------------------------------------------------------

    def _select_with_llm(
        self,
        topic: str,
        benchmarks: list[dict[str, Any]],
        baselines: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Ask LLM to make final selection from filtered candidates.

        Candidate lists are capped (15 benchmarks / 10 baselines) to bound
        prompt size; callers should pass ranked lists so the cap keeps the
        best candidates.
        """
        bench_summary = "\n".join(
            f"- {b.get('name', 'Unknown')} (tier {b.get('tier', '?')}, "
            f"origin: {b.get('origin', '?')}, "
            f"metrics: {b.get('metrics', [])})"
            for b in benchmarks[:15]
        )
        base_summary = "\n".join(
            f"- {bl.get('name', 'Unknown')}: {bl.get('paper', 'N/A')}"
            for bl in baselines[:10]
        )
        system = (
            "You are an ML experiment design expert. Select the BEST combination "
            "of benchmarks and baselines for a research paper.\n\n"
            "Return JSON:\n"
            "{\n"
            '  "primary_benchmark": "name",\n'
            '  "secondary_benchmarks": ["name1", "name2"],\n'
            '  "selected_baselines": ["name1", "name2", "name3"],\n'
            '  "rationale": "why these choices are optimal",\n'
            '  "experiment_notes": "specific setup guidance"\n'
            "}\n\n"
            "RULES:\n"
            "- Select 1 primary benchmark (the main evaluation dataset)\n"
            "- Select 0-2 secondary benchmarks (additional validation)\n"
            "- Select 2-4 baselines (must include at least 1 classic + 1 recent)\n"
            "- Primary benchmark MUST be the domain standard\n"
            "- Prefer benchmarks that top-venue papers commonly use\n"
            "- Consider dataset size vs time budget\n"
            "- CRITICAL: Only select benchmarks that are RELEVANT to the research "
            "topic's domain. Do NOT select image classification datasets (CIFAR, "
            "MNIST) for non-image tasks like PDE solvers, RL, or optimization.\n"
            "- CRITICAL: Baselines must be COMPETING METHODS, not optimizers. "
            "SGD/Adam/AdamW/Cosine LR are NOT baselines — they are training "
            "tools. Baselines must be alternative approaches to the same problem."
        )
        user = (
            f"Research Topic: {topic}\n\n"
            f"Available Benchmarks:\n{bench_summary}\n\n"
            f"Available Baselines:\n{base_summary}\n\n"
            f"Constraints: GPU={self._gpu_mb}MB, "
            f"time_budget={self._time_sec}s, "
            f"network_policy={self._network_policy}\n\n"
            "Make your selection."
        )
        return self._chat_json(system, user, max_tokens=2048)

    def _resolve_selection(
        self,
        selection: dict[str, Any],
        benchmarks: list[dict[str, Any]],
        baselines: list[dict[str, Any]],
    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
        """Resolve LLM-selected names back to full benchmark/baseline dicts.

        Names the LLM returns that do not match a candidate are silently
        dropped. The primary benchmark is tagged role="primary", secondary
        ones role="secondary".
        """
        # Build name lookup
        bench_map = {b.get("name", f"bench_{i}"): b for i, b in enumerate(benchmarks)}
        base_map = {bl.get("name", f"base_{i}"): bl for i, bl in enumerate(baselines)}
        selected_bench: list[dict[str, Any]] = []
        primary = selection.get("primary_benchmark", "")
        if primary and primary in bench_map:
            entry = {**bench_map[primary], "role": "primary"}
            selected_bench.append(entry)
        for name in selection.get("secondary_benchmarks", []):
            if name in bench_map and name != primary:
                entry = {**bench_map[name], "role": "secondary"}
                selected_bench.append(entry)
        selected_base: list[dict[str, Any]] = []
        for name in selection.get("selected_baselines", []):
            if name in base_map:
                selected_base.append(base_map[name])
        return selected_bench, selected_base

    # -- Required baselines injection --------------------------------------

    def _inject_required_baselines(
        self,
        topic: str,
        selected: list[dict[str, Any]],
        ranked: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Load required_baselines from knowledge base and inject missing ones.

        Returns the list of newly injected baseline dicts.
""" try: kb = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) domains = kb.get("domains", {}) if isinstance(kb, dict) else {} except Exception: # noqa: BLE001 return [] topic_lower = topic.lower() injected: list[dict[str, Any]] = [] selected_names = {b.get("name", "").lower() for b in selected} for _domain_id, domain_data in domains.items(): if not isinstance(domain_data, dict): continue keywords = domain_data.get("keywords", []) if not any(kw.lower() in topic_lower for kw in keywords): continue required = domain_data.get("required_baselines", []) if not required: continue # Find each required baseline in ranked list or create stub all_baselines = domain_data.get("common_baselines", []) bl_by_name = {b.get("name", ""): b for b in all_baselines} for req_name in required: if req_name.lower() in selected_names: continue # Try to find full entry from knowledge base if req_name in bl_by_name: entry = {**bl_by_name[req_name], "origin": "required_baseline"} else: entry = {"name": req_name, "origin": "required_baseline", "pip": []} selected.append(entry) selected_names.add(req_name.lower()) injected.append(entry) return injected # -- Main entry point -------------------------------------------------- def execute(self, context: dict[str, Any]) -> AgentStepResult: """Select optimal benchmarks and baselines from survey results. Context keys: topic (str): Research topic survey (dict): Output from SurveyorAgent """ topic = context.get("topic", "") survey = context.get("survey", {}) benchmarks = survey.get("benchmarks", []) baselines = survey.get("baselines", []) if not benchmarks and not baselines: return self._make_result(False, error="No candidates to select from") # 1. Filter by constraints filtered_bench = self._filter_benchmarks(benchmarks) filtered_base = self._filter_baselines(baselines) self.logger.info( "Filtered: %d/%d benchmarks, %d/%d baselines", len(filtered_bench), len(benchmarks), len(filtered_base), len(baselines), ) # 2. 
Rank ranked_bench = self._rank_benchmarks(filtered_bench) ranked_base = self._rank_baselines(filtered_base) # 3. LLM-assisted final selection (if enough candidates) if len(ranked_bench) >= 2 or len(ranked_base) >= 2: selection = self._select_with_llm(topic, ranked_bench, ranked_base) selected_bench, selected_base = self._resolve_selection( selection, ranked_bench, ranked_base, ) else: # Not enough to warrant LLM call — use top ranked # BUG-DA6-06: Create copies to avoid mutating input dicts selected_bench = [{**b, "role": "primary"} if i == 0 else {**b, "role": "secondary"} for i, b in enumerate(ranked_bench[:3])] selected_base = ranked_base[:self._min_base] selection = {} # 4. Fallback: ensure minimums if len(selected_bench) < self._min_bench and ranked_bench: for b in ranked_bench: if b not in selected_bench: selected_bench.append({**b, "role": "secondary"}) if len(selected_bench) >= self._min_bench: break if len(selected_base) < self._min_base and ranked_base: for bl in ranked_base: if bl not in selected_base: selected_base.append(bl) if len(selected_base) >= self._min_base: break # 4b. Improvement E: Inject required baselines from knowledge base _injected_required = self._inject_required_baselines( topic, selected_base, ranked_base, ) if _injected_required: self.logger.info( "Injected %d required baselines: %s", len(_injected_required), [b.get("name") for b in _injected_required], ) # 5. 
Collect required pip packages required_pip: list[str] = [] seen_pip: set[str] = set() for item in selected_bench + selected_base: for pkg in item.get("pip", []): if pkg not in seen_pip: seen_pip.add(pkg) required_pip.append(pkg) result = { "selected_benchmarks": selected_bench, "selected_baselines": selected_base, "required_pip": required_pip, "rationale": selection.get("rationale", ""), "experiment_notes": selection.get("experiment_notes", ""), "total_filtered": len(filtered_bench), } self.logger.info( "Selected: %d benchmarks, %d baselines, %d pip packages", len(selected_bench), len(selected_base), len(required_pip), ) return self._make_result(True, data=result) ================================================ FILE: researchclaw/agents/benchmark_agent/surveyor.py ================================================ """Surveyor Agent — searches for domain-relevant benchmarks and baselines. Data sources (in priority order): 1. Local ``benchmark_knowledge.yaml`` — always available, no network. 2. HuggingFace Hub API (``huggingface_hub``) — dataset discovery by task/keyword. 3. LLM fallback — asks the LLM to suggest benchmarks when APIs unavailable. 
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

import yaml

from researchclaw.agents.base import AgentStepResult, BaseAgent

logger = logging.getLogger(__name__)

# Path to the bundled knowledge base: researchclaw/data/benchmark_knowledge.yaml
_KNOWLEDGE_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "benchmark_knowledge.yaml"


# ---------------------------------------------------------------------------
# HuggingFace Hub helpers (optional dependency)
# ---------------------------------------------------------------------------

_HF_AVAILABLE = False
try:
    from huggingface_hub import HfApi  # type: ignore[import-untyped]
    _HF_AVAILABLE = True
except ImportError:
    pass

# Mapping from our domain keywords to HuggingFace task_categories filters
_DOMAIN_TO_HF_TASK: dict[str, list[str]] = {
    "image_classification": ["image-classification"],
    "text_classification": ["text-classification", "sentiment-analysis"],
    "language_modeling": ["text-generation"],
    "question_answering": ["question-answering"],
    "generative_models": ["unconditional-image-generation"],
    "graph_neural_networks": ["graph-ml"],
    "reinforcement_learning": ["reinforcement-learning"],
    "tabular_learning": ["tabular-classification", "tabular-regression"],
    "llm_finetuning": ["text-generation"],
}


class SurveyorAgent(BaseAgent):
    """Searches local knowledge base and HuggingFace Hub for benchmarks."""

    name = "surveyor"

    def __init__(
        self,
        llm: Any,
        *,
        enable_hf_search: bool = True,
        max_hf_results: int = 10,
    ) -> None:
        super().__init__(llm)
        # HF search is active only if requested AND huggingface_hub imported.
        self._enable_hf = enable_hf_search and _HF_AVAILABLE
        self._max_hf = max_hf_results
        self._knowledge = self._load_knowledge()

    # -- Knowledge base ----------------------------------------------------

    @staticmethod
    def _load_knowledge() -> dict[str, Any]:
        """Load the local benchmark knowledge base.

        Returns the ``domains`` mapping, or ``{}`` if the file is missing
        or malformed (failure is logged, never raised).
        """
        try:
            data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8"))
            return data.get("domains", {}) if isinstance(data, dict) else {}
        except Exception:  # noqa: BLE001
            logger.warning("Failed to load benchmark_knowledge.yaml", exc_info=True)
            return {}

    def _match_domains(self, topic: str) -> list[str]:
        """Return domain IDs whose keywords appear in the topic."""
        topic_lower = topic.lower()
        matched: list[str] = []
        for domain_id, info in self._knowledge.items():
            keywords = info.get("keywords", [])
            for kw in keywords:
                # Substring match; first hit is enough for the domain.
                if kw in topic_lower:
                    matched.append(domain_id)
                    break
        return matched

    def _get_local_candidates(self, domain_ids: list[str]) -> dict[str, Any]:
        """Retrieve benchmarks and baselines from local knowledge base.

        Entries are deduplicated by name across domains and tagged with
        their source domain and ``origin: knowledge_base``.
        """
        benchmarks: list[dict[str, Any]] = []
        baselines: list[dict[str, Any]] = []
        seen_bench: set[str] = set()
        seen_base: set[str] = set()
        for did in domain_ids:
            info = self._knowledge.get(did, {})
            for b in info.get("standard_benchmarks", []):
                name = b.get("name", "")
                if name not in seen_bench:
                    seen_bench.add(name)
                    benchmarks.append({**b, "source_domain": did, "origin": "knowledge_base"})
            for bl in info.get("common_baselines", []):
                name = bl.get("name", "")
                if name not in seen_base:
                    seen_base.add(name)
                    baselines.append({**bl, "source_domain": did, "origin": "knowledge_base"})
        return {"benchmarks": benchmarks, "baselines": baselines}

    # -- HuggingFace Hub ---------------------------------------------------

    def _search_hf_datasets(self, topic: str, domain_ids: list[str]) -> list[dict[str, Any]]:
        """Search HuggingFace Hub for relevant datasets.

        Uses two strategies (task-category filter, then keyword search);
        all failures are logged and swallowed so the survey continues.
        """
        if not self._enable_hf:
            return []
        results: list[dict[str, Any]] = []
        seen: set[str] = set()
        try:
            api = HfApi()
            # Strategy 1: Search by task category
            for did in domain_ids:
                for task_cat in _DOMAIN_TO_HF_TASK.get(did, []):
                    try:
                        datasets = api.list_datasets(
                            filter=[f"task_categories:{task_cat}"],
                            sort="downloads",
                            direction=-1,
                            limit=self._max_hf,
                        )
                        for ds in datasets:
                            if ds.id not in seen:
                                seen.add(ds.id)
                                results.append({
                                    "name": ds.id,
                                    "downloads": getattr(ds, "downloads", 0),
                                    "origin": "huggingface_hub",
                                    "api": f"datasets.load_dataset('{ds.id}', cache_dir='/workspace/data/hf')",
                                    "tier": 2,
                                })
                    except Exception:  # noqa: BLE001
                        logger.debug("HF task search failed for %s", task_cat)
            # Strategy 2: Keyword search on topic
            keywords = self._extract_search_keywords(topic)
            for kw in keywords[:3]:
                try:
                    datasets = api.list_datasets(
                        search=kw,
                        sort="downloads",
                        direction=-1,
                        limit=self._max_hf,
                    )
                    for ds in datasets:
                        if ds.id not in seen:
                            seen.add(ds.id)
                            results.append({
                                "name": ds.id,
                                "downloads": getattr(ds, "downloads", 0),
                                "origin": "huggingface_hub",
                                "api": f"datasets.load_dataset('{ds.id}', cache_dir='/workspace/data/hf')",
                                "tier": 2,
                            })
                except Exception:  # noqa: BLE001
                    logger.debug("HF keyword search failed for %s", kw)
        except Exception as exc:  # noqa: BLE001
            logger.warning("HuggingFace Hub search failed: %s", exc)
        return results

    @staticmethod
    def _extract_search_keywords(topic: str) -> list[str]:
        """Extract 1-3 word search keywords from a topic string."""
        # Remove common filler words to get meaningful search terms
        stop = {
            "a", "an", "the", "for", "in", "on", "of", "to", "with", "and",
            "or", "is", "are", "using", "via", "based", "towards", "novel",
            "new", "improved", "approach", "method", "methods", "study",
        }
        words = [w.lower().strip(".,;:!?()[]") for w in topic.split()]
        filtered = [w for w in words if w and w not in stop and len(w) > 2]
        # Return 2-3 keyword phrases (bigram, trigram, then single word)
        keywords: list[str] = []
        if len(filtered) >= 2:
            keywords.append(" ".join(filtered[:2]))
        if len(filtered) >= 3:
            keywords.append(" ".join(filtered[:3]))
        if filtered:
            keywords.append(filtered[0])
        return keywords

    # -- LLM fallback ------------------------------------------------------

    def _llm_suggest_benchmarks(self, topic: str, hypothesis: str) -> dict[str, Any]:
        """Ask LLM to suggest benchmarks and baselines when APIs unavailable."""
        system = (
            "You are an expert ML researcher.
Given a research topic and hypothesis, "
            "suggest appropriate benchmarks, datasets, and baseline methods.\n\n"
            "Return a JSON object with:\n"
            "- benchmarks: array of {name, domain, metrics: [], api (Python one-liner), "
            " tier (1=pre-cached, 2=downloadable), size_mb}\n"
            "- baselines: array of {name, source (Python code), paper (citation), pip: []}\n"
            "- rationale: string explaining why these are the right choices\n\n"
            "CRITICAL RULES:\n"
            "- Benchmarks and baselines MUST be DOMAIN-APPROPRIATE for the topic.\n"
            "- Do NOT suggest image classification datasets (CIFAR, ImageNet, MNIST) "
            "for non-image topics like PDE solvers, RL, combinatorial optimization, etc.\n"
            "- Do NOT suggest optimizers (SGD, Adam, AdamW) as METHOD baselines — "
            "optimizers are training tools, NOT research methods to compare against.\n"
            "- Baselines must be COMPETING METHODS from the same research domain.\n\n"
            "DOMAIN-SPECIFIC GUIDANCE:\n"
            "- Physics/PDE/Scientific computing: Use SYNTHETIC data (Burgers eq, "
            "Darcy flow, Navier-Stokes, heat equation). Baselines: FNO, DeepONet, "
            "PINN, spectral methods.\n"
            "- Combinatorial optimization (TSP, graph coloring, scheduling): Use "
            "SYNTHETIC instances (random TSP, Erdos-Renyi graphs). Baselines: "
            "classical MCTS, LKH, OR-Tools, Concorde, RL-based methods.\n"
            "- Reinforcement learning: Use Gymnasium environments (CartPole, "
            "LunarLander, HalfCheetah). Baselines: PPO, SAC, DQN, TD3.\n"
            "- Graph learning: Use standard graph benchmarks (Cora, CiteSeer, "
            "ogbn-arxiv). Baselines: GCN, GAT, GraphSAGE.\n"
            "- If the domain naturally requires SYNTHETIC data (PDE, optimization, "
            "theoretical analysis), explicitly set tier=1 and api='synthetic' and "
            "describe the data generation procedure in the 'source' field.\n\n"
            "- Prefer well-known, widely-used benchmarks from top venues\n"
            "- Prefer baselines with open-source PyTorch implementations\n"
            "- Include at least 2 datasets and 2 baselines"
        )
        user = (
            f"Research Topic: {topic}\n"
            f"Hypothesis: {hypothesis}\n\n"
            "Suggest appropriate benchmarks, datasets, and baseline methods. "
            "Make sure they are relevant to the specific domain of this research."
        )
        result = self._chat_json(system, user, max_tokens=4096)
        return result

    # -- Main entry point --------------------------------------------------

    def execute(self, context: dict[str, Any]) -> AgentStepResult:
        """Survey available benchmarks and baselines for the given topic.

        Context keys:
            topic (str): Research topic/title
            hypothesis (str): Research hypothesis
            experiment_plan (str): Experiment plan from previous stages
        """
        topic = context.get("topic", "")
        hypothesis = context.get("hypothesis", "")
        if not topic:
            return self._make_result(False, error="No topic provided")

        self.logger.info("Surveying benchmarks for topic: %s", topic[:80])

        # 1. Match domains from knowledge base (topic first, hypothesis appended,
        #    order-preserving dedup via dict.fromkeys)
        domain_ids = self._match_domains(topic)
        if hypothesis:
            domain_ids = list(dict.fromkeys(
                domain_ids + self._match_domains(hypothesis)
            ))
        self.logger.info("Matched domains: %s", domain_ids)

        # 2. Get local candidates
        local = self._get_local_candidates(domain_ids)

        # 3. Search HuggingFace Hub (if available)
        hf_datasets = self._search_hf_datasets(topic, domain_ids)

        # 4. LLM fallback if no local matches
        llm_suggestions: dict[str, Any] = {}
        if not local["benchmarks"] and not hf_datasets:
            self.logger.info("No local/HF matches — falling back to LLM")
            llm_suggestions = self._llm_suggest_benchmarks(topic, hypothesis)

        # 5. Combine results; LLM-sourced entries are tagged with their origin
        all_benchmarks = local["benchmarks"] + hf_datasets
        if llm_suggestions.get("benchmarks"):
            for b in llm_suggestions["benchmarks"]:
                b["origin"] = "llm_suggestion"
                all_benchmarks.append(b)
        all_baselines = local["baselines"]
        if llm_suggestions.get("baselines"):
            for bl in llm_suggestions["baselines"]:
                bl["origin"] = "llm_suggestion"
                all_baselines.append(bl)

        survey_result = {
            "matched_domains": domain_ids,
            "benchmarks": all_benchmarks,
            "baselines": all_baselines,
            "hf_datasets_found": len(hf_datasets),
            "llm_fallback_used": bool(llm_suggestions),
            "rationale": llm_suggestions.get("rationale", ""),
        }
        self.logger.info(
            "Survey complete: %d benchmarks, %d baselines, %d HF datasets",
            len(all_benchmarks), len(all_baselines), len(hf_datasets),
        )
        return self._make_result(True, data=survey_result)


================================================
FILE: researchclaw/agents/benchmark_agent/validator.py
================================================
"""Validator Agent — validates generated code for correctness.

Performs three levels of validation:

1. **Syntax check** — ``ast.parse()`` on generated Python code.
2. **Import check** — verifies that referenced modules are importable or
   listed in requirements.
3. **LLM review** — asks the LLM to review code for common pitfalls
   (wrong API usage, missing transforms, incorrect splits).
"""

from __future__ import annotations

import ast
import logging
import re
from typing import Any

from researchclaw.agents.base import AgentStepResult, BaseAgent

logger = logging.getLogger(__name__)

# Packages available in Docker image (no pip install needed)
_BUILTIN_MODULES = {
    "torch", "torchvision", "torchaudio", "numpy", "scipy", "sklearn",
    "pandas", "matplotlib", "seaborn", "tqdm", "gymnasium", "networkx",
    "timm", "einops", "torchmetrics", "transformers", "datasets",
    "accelerate", "peft", "trl", "bitsandbytes", "tokenizers",
    "safetensors", "h5py", "tensorboard", "PIL", "yaml", "kornia",
    "albumentations", "cv2", "mujoco",
    "os", "sys", "json", "re", "pathlib", "typing", "collections",
    "functools", "itertools", "math", "random", "copy", "dataclasses",
    "abc", "io", "csv", "glob", "shutil", "time", "datetime", "logging",
    "warnings", "argparse", "pickle", "struct", "hashlib",
}


class ValidatorAgent(BaseAgent):
    """Validates generated code artifacts for syntax and API correctness."""

    name = "validator"

    def _check_syntax(self, code: str, label: str) -> list[str]:
        """Check Python syntax via ast.parse. Returns list of errors.

        Empty/whitespace-only code is treated as valid (no errors).
        """
        if not code.strip():
            return []
        try:
            ast.parse(code)
            return []
        except SyntaxError as e:
            return [f"{label}: SyntaxError at line {e.lineno}: {e.msg}"]

    def _check_imports(
        self,
        code: str,
        label: str,
        extra_requirements: list[str],
    ) -> list[str]:
        """Check that imported modules are available or declared.

        Returns warning strings for imports that are neither in the Docker
        image's builtin set nor covered by ``extra_requirements``.
        """
        if not code.strip():
            return []
        warnings: list[str] = []
        # Extract import statements (captures only the top-level module name)
        import_pattern = re.compile(
            r"^\s*(?:import|from)\s+(\w+)",
            re.MULTILINE,
        )
        imports = set(import_pattern.findall(code))
        # Build allowed set
        allowed = set(_BUILTIN_MODULES)
        # Map pip package names to import names
        pip_to_import = {
            "torch-geometric": "torch_geometric",
            "ogb": "ogb",
            "stable-baselines3": "stable_baselines3",
            "xgboost": "xgboost",
            "opencv-python": "cv2",
            "scikit-learn": "sklearn",
            "gymnasium[mujoco]": "gymnasium",
            "huggingface_hub": "huggingface_hub",
        }
        for pkg in extra_requirements:
            # Fallback heuristic: pip name with dashes -> import name with underscores
            import_name = pip_to_import.get(pkg, pkg.replace("-", "_"))
            allowed.add(import_name)
        for mod in imports:
            if mod not in allowed:
                warnings.append(
                    f"{label}: import '{mod}' not in Docker image or requirements"
                )
        return warnings

    def _llm_review(
        self,
        data_code: str,
        baseline_code: str,
        setup_code: str,
        benchmark_names: list[str],
        baseline_names: list[str],
    ) -> dict[str, Any]:
        """Ask LLM to review generated code for common pitfalls."""
        system = (
            "You are a code reviewer specializing in ML experiment code. "
            "Review the following generated code for correctness.\n\n"
            "Check for:\n"
            "1. Correct API usage (torchvision, HuggingFace datasets, PyG, etc.)\n"
            "2. Proper data transforms and normalization\n"
            "3. Correct train/val/test split handling\n"
            "4. Compatible input/output dimensions between data and models\n"
            "5. Missing error handling for optional dependencies\n"
            "6. Hardcoded paths that should use variables\n"
            "7. Missing download=True in setup.py for tier 2 datasets\n\n"
            "Return JSON:\n"
            "{\n"
            ' "passed": true/false,\n'
            ' "issues": ["issue 1", "issue 2"],\n'
            ' "suggestions": ["suggestion 1"],\n'
            ' "severity": "none" | "warning" | "error"\n'
            "}"
        )
        # Only include sections that actually contain code.
        code_sections = []
        if data_code:
            code_sections.append(f"## Data Loading Code\n```python\n{data_code}\n```")
        if baseline_code:
            code_sections.append(f"## Baseline Code\n```python\n{baseline_code}\n```")
        if setup_code:
            code_sections.append(f"## Setup Script\n```python\n{setup_code}\n```")
        user = (
            f"Benchmarks: {', '.join(benchmark_names)}\n"
            f"Baselines: {', '.join(baseline_names)}\n\n"
            + "\n\n".join(code_sections)
        )
        return self._chat_json(system, user, max_tokens=2048)

    # -- Main entry point --------------------------------------------------

    def execute(self, context: dict[str, Any]) -> AgentStepResult:
        """Validate all generated code artifacts.

        Context keys:
            acquisition (dict): Output from AcquirerAgent
        """
        acq = context.get("acquisition", {})
        data_code = acq.get("data_loader_code", "")
        baseline_code = acq.get("baseline_code", "")
        setup_code = acq.get("setup_code", "")
        requirements = acq.get("requirements", "")
        benchmark_names = acq.get("benchmark_names", [])
        baseline_names = acq.get("baseline_names", [])

        # requirements is a newline-separated requirements.txt style string
        extra_pip = [r.strip() for r in requirements.split("\n") if r.strip()]

        all_errors: list[str] = []
        all_warnings: list[str] = []

        # 1. Syntax checks
        for code, label in [
            (data_code, "data_loader"),
            (baseline_code, "baseline"),
            (setup_code, "setup"),
        ]:
            errors = self._check_syntax(code, label)
            all_errors.extend(errors)

        # 2. Import checks (setup script is exempt)
        for code, label in [
            (data_code, "data_loader"),
            (baseline_code, "baseline"),
        ]:
            warnings = self._check_imports(code, label, extra_pip)
            all_warnings.extend(warnings)

        # 3.
        # LLM review (only if no syntax errors)
        llm_review: dict[str, Any] = {}
        if not all_errors:
            llm_review = self._llm_review(
                data_code, baseline_code, setup_code,
                benchmark_names, baseline_names,
            )
            # "error" severity escalates issues to errors; otherwise warnings.
            if llm_review.get("severity") == "error":
                all_errors.extend(llm_review.get("issues", []))
            elif llm_review.get("issues"):
                all_warnings.extend(llm_review.get("issues", []))

        passed = len(all_errors) == 0
        severity = "error" if all_errors else ("warning" if all_warnings else "none")

        result = {
            "passed": passed,
            "errors": all_errors,
            "warnings": all_warnings,
            "severity": severity,
            "llm_review": llm_review,
            "suggestions": llm_review.get("suggestions", []),
        }
        self.logger.info(
            "Validation %s: %d errors, %d warnings",
            "PASSED" if passed else "FAILED",
            len(all_errors), len(all_warnings),
        )
        return self._make_result(passed, data=result)


================================================
FILE: researchclaw/agents/code_searcher/__init__.py
================================================
"""Code Searcher agent — searches GitHub for reference code before generation.

This agent searches GitHub repositories and code to find relevant examples
that inform the blueprint generation process, especially for domains where
the LLM's internal knowledge may be insufficient.
"""

from researchclaw.agents.code_searcher.agent import CodeSearchAgent, CodeSearchResult

__all__ = ["CodeSearchAgent", "CodeSearchResult"]


================================================
FILE: researchclaw/agents/code_searcher/agent.py
================================================
"""Code Search Agent — orchestrates GitHub search, pattern extraction, and caching.

This is the main entry point for code search. It:

1. Checks cache for existing results
2. Generates search queries (LLM or heuristic)
3. Searches GitHub for repos and code
4. Reads key files from top repos
5. Extracts patterns using LLM
6. Caches results for future use
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any

from researchclaw.agents.code_searcher.cache import SearchCache
from researchclaw.agents.code_searcher.github_client import (
    CodeSnippet,
    GitHubClient,
    RepoAnalysis,
    RepoInfo,
)
from researchclaw.agents.code_searcher.pattern_extractor import CodePatterns, extract_patterns
from researchclaw.agents.code_searcher.query_gen import generate_search_queries
from researchclaw.domains.detector import DomainProfile

logger = logging.getLogger(__name__)


@dataclass
class CodeSearchResult:
    """Complete result from a code search operation."""

    patterns: CodePatterns = field(default_factory=CodePatterns)
    repos_found: list[RepoInfo] = field(default_factory=list)
    snippets_found: list[CodeSnippet] = field(default_factory=list)
    repo_analyses: list[RepoAnalysis] = field(default_factory=list)
    queries_used: list[str] = field(default_factory=list)
    from_cache: bool = False
    github_requests: int = 0

    def to_prompt_context(self) -> str:
        """Format as context block for injection into code generation prompts."""
        if not self.patterns.has_content:
            return ""
        return self.patterns.to_prompt_context()

    def to_cache_dict(self) -> dict[str, Any]:
        """Serialize for caching.

        Only the top 5 repos and pattern data are persisted; snippets and
        full repo analyses are not cached.
        """
        return {
            "api_patterns": self.patterns.api_patterns,
            "file_structure": self.patterns.file_structure,
            "evaluation_patterns": self.patterns.evaluation_patterns,
            "library_versions": self.patterns.library_versions,
            "repos": [
                {
                    "full_name": r.full_name,
                    "description": r.description,
                    "stars": r.stars,
                    "html_url": r.html_url,
                }
                for r in self.repos_found[:5]
            ],
            "queries": self.queries_used,
        }

    @classmethod
    def from_cache_dict(cls, data: dict[str, Any]) -> CodeSearchResult:
        """Deserialize from cache. The result is marked ``from_cache=True``."""
        patterns = CodePatterns(
            api_patterns=data.get("api_patterns", []),
            file_structure=data.get("file_structure", {}),
            evaluation_patterns=data.get("evaluation_patterns", []),
            library_versions=data.get("library_versions", {}),
        )
        repos = [
            RepoInfo(
                full_name=r.get("full_name", ""),
                description=r.get("description", ""),
                stars=r.get("stars", 0),
                html_url=r.get("html_url", ""),
            )
            for r in data.get("repos", [])
        ]
        return cls(
            patterns=patterns,
            repos_found=repos,
            queries_used=data.get("queries", []),
            from_cache=True,
        )


class CodeSearchAgent:
    """Orchestrates code search for reference material before code generation.

    Usage::

        agent = CodeSearchAgent(llm=llm_client)
        result = agent.search(
            topic="PDE solver comparison",
            domain=domain_profile,
            specific_needs=["finite element method", "convergence test"],
        )
        context = result.to_prompt_context()
    """

    def __init__(
        self,
        llm: Any | None = None,
        github_token: str | None = None,
        cache: SearchCache | None = None,
        max_repos_to_analyze: int = 3,
        max_code_searches: int = 3,
    ) -> None:
        self._llm = llm
        self._github = GitHubClient(token=github_token)
        self._cache = cache or SearchCache()
        self._max_repos = max_repos_to_analyze
        self._max_code_searches = max_code_searches

    def search(
        self,
        topic: str,
        domain: DomainProfile,
        specific_needs: list[str] | None = None,
    ) -> CodeSearchResult:
        """Execute a complete code search for a research topic.

        Flow:
        1. Check cache
        2. Generate search queries
        3. Search GitHub repos + code
        4. Read key files from top repos
        5. Extract patterns
        6. Cache results

        Parameters
        ----------
        topic : str
            Research topic.
        domain : DomainProfile
            Detected domain profile.
        specific_needs : list[str], optional
            Specific library/API needs.

        Returns
        -------
        CodeSearchResult
        """
        logger.info("Code search started for: %.60s (domain=%s)", topic, domain.domain_id)

        # 1. Check cache
        cached = self._cache.get(domain.domain_id, topic)
        if cached:
            logger.info("Using cached code search results")
            return CodeSearchResult.from_cache_dict(cached)

        # 2.
        # Generate search queries
        queries = generate_search_queries(
            topic=topic,
            domain_name=domain.display_name,
            core_libraries=domain.core_libraries,
            specific_needs=specific_needs,
            llm=self._llm,
        )
        # Add domain-specific search terms from profile
        if domain.github_search_terms:
            for term in domain.github_search_terms[:2]:
                if term not in queries:
                    queries.append(term)

        result = CodeSearchResult(queries_used=queries)

        # 3. Search GitHub repos (use first query)
        if queries:
            try:
                repos = self._github.search_repos(queries[0], max_results=10)
                # Filter: recent, well-starred
                repos = [
                    r for r in repos
                    if r.stars >= 10  # minimum quality threshold
                ]
                result.repos_found = repos[:self._max_repos * 2]
            except Exception:
                logger.warning("Repo search failed, continuing", exc_info=True)

        # 4. Search GitHub code (use remaining queries)
        code_snippets: list[str] = []
        for query in queries[1:self._max_code_searches + 1]:
            try:
                snippets = self._github.search_code(query, max_results=5)
                result.snippets_found.extend(snippets)
            except Exception:
                logger.warning("Code search failed for query: %s", query)

        # 5. Read key files from top repos
        for repo in result.repos_found[:self._max_repos]:
            try:
                analysis = self._analyze_repo(repo)
                if analysis:
                    result.repo_analyses.append(analysis)
                    # Collect code snippets
                    for content in analysis.key_files.values():
                        if content:
                            code_snippets.append(content)
            except Exception:
                logger.warning("Failed to analyze repo: %s", repo.full_name)

        # Also fetch content for code search results
        for snippet in result.snippets_found[:5]:
            try:
                content = self._github.get_file_content(
                    snippet.repo_full_name,
                    snippet.file_path,
                )
                if content:
                    snippet.content = content
                    code_snippets.append(content)
            except Exception:
                # Best-effort fetch; a missing file is not an error.
                pass

        # 6. Extract patterns
        if code_snippets:
            result.patterns = extract_patterns(
                code_snippets=code_snippets,
                topic=topic,
                domain_name=domain.display_name,
                llm=self._llm,
            )

        result.github_requests = self._github.request_count

        # 7. Cache results (only when pattern extraction produced something)
        if result.patterns.has_content:
            self._cache.put(domain.domain_id, topic, result.to_cache_dict())

        logger.info(
            "Code search complete: %d repos, %d snippets, %d patterns, %d API calls",
            len(result.repos_found),
            len(result.snippets_found),
            len(result.patterns.api_patterns),
            result.github_requests,
        )
        return result

    def _analyze_repo(self, repo: RepoInfo) -> RepoAnalysis | None:
        """Analyze a repository by reading key files."""
        analysis = RepoAnalysis(repo=repo)
        # Get README
        readme = self._github.get_readme(repo.full_name)
        if readme:
            analysis.readme = readme[:3000]  # truncate
        # Get file tree
        file_tree = self._github.get_repo_tree(
            repo.full_name,
            repo.default_branch,
        )
        analysis.file_tree = file_tree
        # Identify and read key files
        key_patterns = [
            "main.py", "run.py", "train.py", "experiment.py",
            "requirements.txt", "setup.py", "pyproject.toml",
        ]
        for pattern in key_patterns:
            matches = [f for f in file_tree if f.endswith(pattern)]
            for match in matches[:1]:  # first match only
                content = self._github.get_file_content(
                    repo.full_name,
                    match,
                    max_size_kb=50,
                )
                if content:
                    analysis.key_files[match] = content
        # Parse requirements (strip version pins, comments, blank lines)
        req_content = analysis.key_files.get("requirements.txt", "")
        if req_content:
            analysis.requirements = [
                line.strip().split("==")[0].split(">=")[0]
                for line in req_content.splitlines()
                if line.strip() and not line.startswith("#")
            ]
        return analysis


================================================
FILE: researchclaw/agents/code_searcher/cache.py
================================================
"""Disk-based cache for code search results.

Caches search results by domain + topic hash with a configurable TTL
(default 30 days). This avoids redundant GitHub API calls for similar
topics within the same domain.
""" from __future__ import annotations import hashlib import json import logging import time from dataclasses import asdict from pathlib import Path from typing import Any logger = logging.getLogger(__name__) _DEFAULT_CACHE_DIR = Path(__file__).parent.parent.parent / "data" / "code_search_cache" _DEFAULT_TTL_DAYS = 30 class SearchCache: """Disk-based cache for code search results. Cache structure:: code_search_cache/ {domain_id}/ {topic_hash}.json """ def __init__( self, cache_dir: Path | None = None, ttl_days: int = _DEFAULT_TTL_DAYS, ) -> None: self._cache_dir = cache_dir or _DEFAULT_CACHE_DIR self._ttl_sec = ttl_days * 86400 def get(self, domain_id: str, topic: str) -> dict[str, Any] | None: """Get cached result if it exists and is not expired.""" cache_path = self._cache_path(domain_id, topic) if not cache_path.exists(): return None try: data = json.loads(cache_path.read_text(encoding="utf-8")) timestamp = data.get("_cached_at", 0) if time.time() - timestamp > self._ttl_sec: logger.debug("Cache expired for %s/%s", domain_id, topic[:40]) cache_path.unlink(missing_ok=True) return None logger.info("Cache hit for %s/%s", domain_id, topic[:40]) return data except Exception: logger.warning("Failed to read cache", exc_info=True) return None def put(self, domain_id: str, topic: str, data: dict[str, Any]) -> None: """Store a result in the cache.""" cache_path = self._cache_path(domain_id, topic) cache_path.parent.mkdir(parents=True, exist_ok=True) data["_cached_at"] = time.time() data["_domain_id"] = domain_id data["_topic_hash"] = self._topic_hash(topic) try: cache_path.write_text( json.dumps(data, indent=2, default=str), encoding="utf-8", ) logger.debug("Cached result for %s/%s", domain_id, topic[:40]) except Exception: logger.warning("Failed to write cache", exc_info=True) def clear(self, domain_id: str | None = None) -> int: """Clear cache. 
Returns number of entries removed.""" count = 0 if domain_id: domain_dir = self._cache_dir / domain_id if domain_dir.is_dir(): for f in domain_dir.glob("*.json"): f.unlink() count += 1 else: if self._cache_dir.is_dir(): for f in self._cache_dir.rglob("*.json"): f.unlink() count += 1 return count def stats(self) -> dict[str, int]: """Return cache statistics.""" total = 0 expired = 0 by_domain: dict[str, int] = {} if not self._cache_dir.is_dir(): return {"total": 0, "expired": 0} for f in self._cache_dir.rglob("*.json"): total += 1 domain = f.parent.name by_domain[domain] = by_domain.get(domain, 0) + 1 try: data = json.loads(f.read_text(encoding="utf-8")) if time.time() - data.get("_cached_at", 0) > self._ttl_sec: expired += 1 except Exception: pass return {"total": total, "expired": expired, **by_domain} def _cache_path(self, domain_id: str, topic: str) -> Path: return self._cache_dir / domain_id / f"{self._topic_hash(topic)}.json" @staticmethod def _topic_hash(topic: str) -> str: return hashlib.sha256(topic.lower().strip().encode()).hexdigest()[:16] ================================================ FILE: researchclaw/agents/code_searcher/github_client.py ================================================ """GitHub REST API client for code and repository search. 
Handles rate limiting, authentication, and response parsing for: - Repository search (``/search/repositories``) - Code search (``/search/code``) - File content retrieval (``/repos/{owner}/{repo}/contents/{path}``) - README retrieval Rate limits: - Authenticated: 30 req/min for search, 5000 req/hr for core - Code search: 10 req/min - Unauthenticated: 10 req/min for search """ from __future__ import annotations import logging import os import time from dataclasses import dataclass, field from typing import Any from urllib.parse import quote logger = logging.getLogger(__name__) _GITHUB_API = "https://api.github.com" @dataclass class RepoInfo: """Summary of a GitHub repository.""" full_name: str # "owner/repo" description: str = "" stars: int = 0 language: str = "" updated_at: str = "" html_url: str = "" default_branch: str = "main" topics: list[str] = field(default_factory=list) @dataclass class CodeSnippet: """A code snippet found via GitHub code search.""" repo_full_name: str file_path: str file_url: str = "" content: str = "" # populated after fetching score: float = 0.0 @dataclass class RepoAnalysis: """Analysis of a repository's structure and content.""" repo: RepoInfo readme: str = "" requirements: list[str] = field(default_factory=list) key_files: dict[str, str] = field(default_factory=dict) # path -> content file_tree: list[str] = field(default_factory=list) class GitHubClient: """GitHub REST API client with rate limiting and caching. Uses ``GITHUB_TOKEN`` env var for authentication (strongly recommended). Falls back to unauthenticated access (much lower rate limits). 
""" def __init__(self, token: str | None = None) -> None: self._token = token or os.environ.get("GITHUB_TOKEN", "") self._last_search_time: float = 0 self._search_interval: float = 6.0 # 10 req/min → 6s between requests self._request_count: int = 0 def _headers(self) -> dict[str, str]: headers = { "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", } if self._token: headers["Authorization"] = f"Bearer {self._token}" return headers def _rate_limit_wait(self) -> None: """Enforce rate limiting between search requests.""" elapsed = time.time() - self._last_search_time if elapsed < self._search_interval: wait = self._search_interval - elapsed logger.debug("Rate limit: waiting %.1fs", wait) time.sleep(wait) self._last_search_time = time.time() def _get(self, url: str, params: dict[str, str] | None = None) -> dict[str, Any] | None: """Make a GET request to the GitHub API.""" import urllib.request import urllib.error import json if params: query_str = "&".join(f"{k}={quote(str(v))}" for k, v in params.items()) url = f"{url}?{query_str}" req = urllib.request.Request(url, headers=self._headers()) self._request_count += 1 try: with urllib.request.urlopen(req, timeout=15) as resp: return json.loads(resp.read().decode("utf-8")) except urllib.error.HTTPError as e: if e.code == 403: logger.warning("GitHub API rate limited (403). Skipping.") return None if e.code == 422: logger.warning("GitHub API validation error (422): %s", url) return None logger.warning("GitHub API error %d: %s", e.code, url) return None except Exception: logger.warning("GitHub API request failed: %s", url, exc_info=True) return None def search_repos( self, query: str, language: str = "Python", sort: str = "stars", max_results: int = 10, ) -> list[RepoInfo]: """Search for repositories matching a query. Parameters ---------- query : str Search query (e.g., "PDE solver finite element"). language : str Filter by programming language. 
sort : str Sort order: "stars", "updated", "best-match". max_results : int Maximum number of results to return. Returns ------- list[RepoInfo] """ self._rate_limit_wait() search_q = f"{query} language:{language}" params = { "q": search_q, "sort": sort, "order": "desc", "per_page": str(min(max_results, 30)), } data = self._get(f"{_GITHUB_API}/search/repositories", params) if data is None: return [] repos: list[RepoInfo] = [] for item in data.get("items", [])[:max_results]: repos.append(RepoInfo( full_name=item.get("full_name", ""), description=item.get("description", "") or "", stars=item.get("stargazers_count", 0), language=item.get("language", "") or "", updated_at=item.get("updated_at", ""), html_url=item.get("html_url", ""), default_branch=item.get("default_branch", "main"), topics=item.get("topics", []), )) logger.info("Found %d repos for query: %.60s", len(repos), query) return repos def search_code( self, query: str, language: str = "Python", max_results: int = 10, ) -> list[CodeSnippet]: """Search for code snippets matching a query. Note: Code search has stricter rate limits (10 req/min). Parameters ---------- query : str Search query (e.g., "from pyscf import gto scf"). language : str Filter by programming language. max_results : int Maximum results. 
Returns ------- list[CodeSnippet] """ self._rate_limit_wait() search_q = f"{query} language:{language}" params = { "q": search_q, "per_page": str(min(max_results, 30)), } data = self._get(f"{_GITHUB_API}/search/code", params) if data is None: return [] snippets: list[CodeSnippet] = [] for item in data.get("items", [])[:max_results]: repo = item.get("repository", {}) snippets.append(CodeSnippet( repo_full_name=repo.get("full_name", ""), file_path=item.get("path", ""), file_url=item.get("html_url", ""), score=item.get("score", 0.0), )) logger.info("Found %d code snippets for query: %.60s", len(snippets), query) return snippets def get_file_content( self, repo_full_name: str, path: str, max_size_kb: int = 100, ) -> str | None: """Get the content of a file from a repository. Parameters ---------- repo_full_name : str Repository in "owner/repo" format. path : str File path within the repository. max_size_kb : int Skip files larger than this. Returns ------- str or None File content, or None if not found/too large. 
""" import base64 url = f"{_GITHUB_API}/repos/{repo_full_name}/contents/{quote(path, safe='/')}" data = self._get(url) if data is None: return None size = data.get("size", 0) if size > max_size_kb * 1024: logger.debug("File too large (%d KB): %s/%s", size // 1024, repo_full_name, path) return None content = data.get("content", "") encoding = data.get("encoding", "") if encoding == "base64": try: return base64.b64decode(content).decode("utf-8", errors="replace") except Exception: return None return content def get_readme(self, repo_full_name: str) -> str | None: """Get the README content of a repository.""" import base64 url = f"{_GITHUB_API}/repos/{repo_full_name}/readme" data = self._get(url) if data is None: return None content = data.get("content", "") encoding = data.get("encoding", "") if encoding == "base64": try: return base64.b64decode(content).decode("utf-8", errors="replace") except Exception: return None return content def get_repo_tree( self, repo_full_name: str, branch: str = "main", ) -> list[str]: """Get the file tree of a repository (flat list of paths).""" url = f"{_GITHUB_API}/repos/{repo_full_name}/git/trees/{branch}" params = {"recursive": "1"} data = self._get(url, params) if data is None: return [] tree = data.get("tree", []) return [item["path"] for item in tree if item.get("type") == "blob"] @property def request_count(self) -> int: return self._request_count @property def has_token(self) -> bool: return bool(self._token) ================================================ FILE: researchclaw/agents/code_searcher/pattern_extractor.py ================================================ """Extract reusable code patterns from GitHub search results. 
Uses LLM to analyze reference code and extract: - API call patterns (how to use a specific library) - File organization patterns (project structure) - Data processing patterns (data loading / preprocessing) - Evaluation patterns (how to compute and report metrics) """ from __future__ import annotations import json import logging import re from dataclasses import dataclass, field from typing import Any logger = logging.getLogger(__name__) @dataclass class CodePatterns: """Extracted patterns from reference code.""" api_patterns: list[str] = field(default_factory=list) file_structure: dict[str, str] = field(default_factory=dict) data_patterns: list[str] = field(default_factory=list) evaluation_patterns: list[str] = field(default_factory=list) library_versions: dict[str, str] = field(default_factory=dict) raw_snippets: list[str] = field(default_factory=list) def to_prompt_context(self) -> str: """Format patterns as context for code generation prompts.""" parts: list[str] = [] if self.api_patterns: parts.append("## Reference API Usage Patterns") for i, pattern in enumerate(self.api_patterns[:5], 1): parts.append(f"### Pattern {i}") parts.append(f"```python\n{pattern}\n```") if self.file_structure: parts.append("\n## Reference Project Structure") for fname, desc in self.file_structure.items(): parts.append(f"- `{fname}`: {desc}") if self.evaluation_patterns: parts.append("\n## Reference Evaluation Patterns") for pattern in self.evaluation_patterns[:3]: parts.append(f"```python\n{pattern}\n```") return "\n".join(parts) @property def has_content(self) -> bool: return bool(self.api_patterns or self.file_structure or self.evaluation_patterns) _EXTRACT_PROMPT = """\ You are analyzing reference code to extract reusable patterns for a research project. 
Research topic: {topic} Domain: {domain_name} Here are code snippets from relevant GitHub repositories: {code_snippets} Extract the following patterns as JSON: {{ "api_patterns": [ "# Short, self-contained code snippet showing key API usage", "# Each should be 3-10 lines showing one specific API call pattern" ], "file_structure": {{ "filename.py": "what this file does" }}, "evaluation_patterns": [ "# How results are computed and reported" ], "library_versions": {{ "library_name": "recommended version" }} }} Focus on: 1. How the core libraries are imported and used 2. Common data loading / preprocessing patterns 3. How experiments are structured 4. How results are computed and reported Return ONLY valid JSON.""" def extract_patterns( code_snippets: list[str], topic: str, domain_name: str, llm: Any | None = None, ) -> CodePatterns: """Extract code patterns from reference snippets. Parameters ---------- code_snippets : list[str] Code content from GitHub repos. topic : str Research topic for context. domain_name : str Domain name for context. llm : LLMClient, optional LLM for pattern extraction. Falls back to heuristic if not provided. 
Returns ------- CodePatterns """ if not code_snippets: return CodePatterns() if llm is not None: return _llm_extract(code_snippets, topic, domain_name, llm) return _heuristic_extract(code_snippets) def _llm_extract( snippets: list[str], topic: str, domain_name: str, llm: Any, ) -> CodePatterns: """Extract patterns using LLM analysis.""" try: # Truncate snippets to fit context combined = "" for i, snippet in enumerate(snippets[:5]): truncated = snippet[:2000] if len(snippet) > 2000 else snippet combined += f"\n--- Snippet {i+1} ---\n{truncated}\n" prompt = _EXTRACT_PROMPT.format( topic=topic, domain_name=domain_name, code_snippets=combined, ) if hasattr(llm, "chat"): import asyncio try: loop = asyncio.get_running_loop() except RuntimeError: loop = None if loop and loop.is_running(): return _heuristic_extract(snippets) resp = llm.chat( [{"role": "user", "content": prompt}], system="You extract code patterns as JSON.", max_tokens=1500, ) else: return _heuristic_extract(snippets) content = resp.content if hasattr(resp, "content") else str(resp) # Parse JSON from response json_match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", content, re.DOTALL) if json_match: data = json.loads(json_match.group()) return CodePatterns( api_patterns=data.get("api_patterns", []), file_structure=data.get("file_structure", {}), evaluation_patterns=data.get("evaluation_patterns", []), library_versions=data.get("library_versions", {}), raw_snippets=snippets[:5], ) except Exception: logger.warning("LLM pattern extraction failed", exc_info=True) return _heuristic_extract(snippets) def _heuristic_extract(snippets: list[str]) -> CodePatterns: """Extract patterns using regex heuristics (no LLM needed).""" patterns = CodePatterns(raw_snippets=snippets[:5]) for snippet in snippets: # Extract import statements as API patterns imports = re.findall(r"^(?:from|import)\s+.+$", snippet, re.MULTILINE) for imp in imports[:10]: if imp not in patterns.api_patterns: patterns.api_patterns.append(imp) # 
Extract function/class definitions for structure hints defs = re.findall(r"^(?:def|class)\s+(\w+)", snippet, re.MULTILINE) for d in defs[:5]: if d not in patterns.file_structure: patterns.file_structure[d] = "detected function/class" # Deduplicate patterns.api_patterns = list(dict.fromkeys(patterns.api_patterns))[:10] return patterns ================================================ FILE: researchclaw/agents/code_searcher/query_gen.py ================================================ """LLM-based search query generation for code search. Given a research topic and domain, generates targeted search queries for GitHub repository and code search. """ from __future__ import annotations import json import logging import re from typing import Any logger = logging.getLogger(__name__) _QUERY_GEN_PROMPT = """\ You are generating GitHub search queries to find reference code for a research experiment. Research topic: {topic} Domain: {domain_name} Core libraries: {libraries} Specific needs: {needs} Generate 3-5 search queries that will help find: 1. Example implementations using the domain's core libraries 2. Similar research projects or experiments 3. Specific API usage patterns needed for this experiment Rules: - Each query should be 3-8 words (GitHub search works best with short queries) - Include library names when searching for API usage - Include domain-specific terms - Focus on FINDING CODE, not documentation Respond as a JSON array of strings. Example: ["pyscf DFT hartree fock example", "molecular energy calculation python"] Queries:""" def generate_search_queries( topic: str, domain_name: str, core_libraries: list[str], specific_needs: list[str] | None = None, llm: Any | None = None, ) -> list[str]: """Generate search queries for GitHub code search. If no LLM is provided, generates queries from topic keywords and library names using heuristic rules. Parameters ---------- topic : str Research topic. domain_name : str Domain display name. 
core_libraries : list[str] Domain's core libraries. specific_needs : list[str], optional Specific API/library needs. llm : LLMClient, optional LLM for query generation. Returns ------- list[str] 3-5 search queries. """ if llm is not None: return _llm_generate(topic, domain_name, core_libraries, specific_needs or [], llm) return _heuristic_generate(topic, domain_name, core_libraries, specific_needs or []) def _heuristic_generate( topic: str, domain_name: str, libraries: list[str], needs: list[str], ) -> list[str]: """Generate queries without LLM using keyword extraction.""" queries: list[str] = [] # Clean topic: extract key phrases topic_words = _extract_key_phrases(topic) # Query 1: Topic + main library if libraries: queries.append(f"{topic_words} {libraries[0]}") # Query 2: Domain + "python example" queries.append(f"{domain_name.lower()} python example") # Query 3: Specific library usage for lib in libraries[:2]: queries.append(f"{lib} example tutorial python") # Query 4: Specific needs for need in needs[:2]: queries.append(f"{need} python") # Deduplicate and limit seen: set[str] = set() unique: list[str] = [] for q in queries: q_norm = q.lower().strip() if q_norm not in seen: seen.add(q_norm) unique.append(q) return unique[:5] def _llm_generate( topic: str, domain_name: str, libraries: list[str], needs: list[str], llm: Any, ) -> list[str]: """Generate queries using LLM.""" try: prompt = _QUERY_GEN_PROMPT.format( topic=topic, domain_name=domain_name, libraries=", ".join(libraries), needs=", ".join(needs) if needs else "general usage", ) # Synchronous LLM call — LLMClient.chat() is sync and takes # (messages, *, system=, max_tokens=) signature. 
if hasattr(llm, "chat"): resp = llm.chat( [{"role": "user", "content": prompt}], system="You generate concise GitHub search queries.", max_tokens=200, ) else: return _heuristic_generate(topic, domain_name, libraries, needs) content = resp.content if hasattr(resp, "content") else str(resp) # Parse JSON array from response json_match = re.search(r"\[.*\]", content, re.DOTALL) if json_match: queries = json.loads(json_match.group()) if isinstance(queries, list) and all(isinstance(q, str) for q in queries): return queries[:5] logger.warning("Failed to parse LLM query response, using heuristic") return _heuristic_generate(topic, domain_name, libraries, needs) except Exception: logger.warning("LLM query generation failed", exc_info=True) return _heuristic_generate(topic, domain_name, libraries, needs) def _extract_key_phrases(text: str, max_words: int = 5) -> str: """Extract key phrases from a research topic.""" # Remove common filler words stop_words = { "a", "an", "the", "of", "for", "in", "on", "with", "and", "or", "to", "by", "is", "are", "using", "based", "via", "through", "novel", "new", "improved", "efficient", "towards", } words = text.lower().split() key_words = [w for w in words if w not in stop_words and len(w) > 2] return " ".join(key_words[:max_words]) ================================================ FILE: researchclaw/agents/figure_agent/__init__.py ================================================ """FigureAgent — multi-agent intelligent chart generation system. Architecture ------------ 1. **Planner** — analyzes experiment results and determines which charts to generate, their types, layouts, and captions. 2. **CodeGen** — generates Python matplotlib plotting scripts using academic styling (SciencePlots, 300 DPI, colorblind-safe palettes). 3. **Renderer** — executes plotting scripts and verifies output files. 4. **Critic** — tri-modal review: numerical accuracy, text correctness, and visual quality assessment. 5. 
**Integrator** — determines figure placement in the paper and generates markdown references with captions. The ``FigureOrchestrator`` coordinates all agents and produces a ``FigurePlan`` consumed by downstream pipeline stages (paper draft, paper export). """ from researchclaw.agents.figure_agent.orchestrator import ( FigureOrchestrator, FigurePlan, ) __all__ = ["FigureOrchestrator", "FigurePlan"] ================================================ FILE: researchclaw/agents/figure_agent/codegen.py ================================================ """CodeGen Agent — generates visualization code for each figure. Takes the Planner's figure specifications and experiment data, then generates either: - Standalone Python scripts (Matplotlib/Seaborn) — run by Renderer - LaTeX code (TikZ/PGFPlots) — embedded directly in the paper Architecture follows Visual ChatGPT (Wu et al., 2023): the LLM acts as a *controller* calling deterministic render tools instead of generating pixels directly. """ from __future__ import annotations import json import logging import re from pathlib import Path from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult from researchclaw.agents.figure_agent.style_config import get_style_preamble from researchclaw.utils.sanitize import sanitize_figure_id from researchclaw.utils.thinking_tags import strip_thinking_tags logger = logging.getLogger(__name__) def _esc(s: str) -> str: """Escape curly braces in user-provided strings for str.format().""" return s.replace("{", "{{").replace("}", "}}") # --------------------------------------------------------------------------- # Degenerate data detection # --------------------------------------------------------------------------- def _is_degenerate_data(values: list[float]) -> bool: """Return True if data values are too degenerate to produce a useful chart. Rejects: empty lists, all-zero, all-identical, or single-value data. 
""" if not values or len(values) < 1: return True if all(v == 0 for v in values): return True if len(values) >= 2 and len(set(round(v, 6) for v in values)) <= 1: return True return False # --------------------------------------------------------------------------- # Metric name humanization # --------------------------------------------------------------------------- _METRIC_DISPLAY_NAMES: dict[str, str] = { "primary_metric": "Performance", "accuracy": "Accuracy (%)", "loss": "Loss", "f1_score": "F1 Score", "precision": "Precision", "recall": "Recall", "reward": "Reward", "return": "Return", "mse": "MSE", "mae": "MAE", "rmse": "RMSE", "bleu": "BLEU", "rouge": "ROUGE", "perplexity": "Perplexity", "auc": "AUC", } def _humanize_label(raw: str) -> str: """Convert raw metric names like 'primary_metric' to human-readable labels.""" if not raw: return "" low = raw.lower().strip() if low in _METRIC_DISPLAY_NAMES: return _METRIC_DISPLAY_NAMES[low] # Convert snake_case to Title Case return raw.replace("_", " ").title() # --------------------------------------------------------------------------- # Built-in chart templates # --------------------------------------------------------------------------- _TEMPLATE_BAR_COMPARISON = ''' {style_preamble} # Data conditions = {conditions} values = {values} ci_low = {ci_low} ci_high = {ci_high} # Plot fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) x = np.arange(len(conditions)) bar_colors = [COLORS[i % len(COLORS)] for i in range(len(conditions))] yerr_lo = [max(0, v - lo) for v, lo in zip(values, ci_low)] yerr_hi = [max(0, hi - v) for v, hi in zip(values, ci_high)] bars = ax.bar(x, values, color=bar_colors, alpha=0.85, edgecolor="white", linewidth=0.5) ax.errorbar(x, values, yerr=[yerr_lo, yerr_hi], fmt="none", ecolor="#333", capsize=4, capthick=1.2, linewidth=1.2) # Value labels offset = max(yerr_hi) * 0.08 if yerr_hi and max(yerr_hi) > 0 else max(values) * 0.02 for i, v in enumerate(values): ax.text(i, v 
+ offset, f"{{v:.4f}}", ha="center", va="bottom", fontweight="bold") ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.set_xticks(x) ax.set_xticklabels([c.replace("_", " ") for c in conditions], rotation=25, ha="right") ax.grid(True, axis="y", alpha=0.3) ax.set_axisbelow(True) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_GROUPED_BAR = ''' {style_preamble} # Data: conditions x metrics conditions = {conditions} metric_names = {metric_names} # data_matrix[i][j] = value for condition i, metric j data_matrix = {data_matrix} # Plot n_groups = len(conditions) n_bars = len(metric_names) fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) x = np.arange(n_groups) bar_width = 0.8 / n_bars for j, metric in enumerate(metric_names): offset = (j - n_bars / 2 + 0.5) * bar_width vals = [data_matrix[i][j] for i in range(n_groups)] ax.bar(x + offset, vals, bar_width, label=metric.replace("_", " "), color=COLORS[j % len(COLORS)], alpha=0.85, edgecolor="white", linewidth=0.5) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.set_xticks(x) ax.set_xticklabels([c.replace("_", " ") for c in conditions], rotation=25, ha="right") ax.legend(loc="upper left", bbox_to_anchor=(0, 1), framealpha=0.9, edgecolor="gray") ax.grid(True, axis="y", alpha=0.3) ax.set_axisbelow(True) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_TRAINING_CURVE = ''' {style_preamble} # Data: each series is (label, epochs, values, [optional std]) series_data = {series_data} fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) for idx, series in enumerate(series_data): label = series["label"] epochs = series["epochs"] values = series["values"] color = COLORS[idx % len(COLORS)] ls = LINE_STYLES[idx % len(LINE_STYLES)] marker = MARKERS[idx % len(MARKERS)] ax.plot(epochs, values, linestyle=ls, color=color, linewidth=1.5, 
marker=marker, markersize=4, markevery=max(1, len(epochs)//10), label=label.replace("_", " ")) if "std" in series and series["std"]: std = series["std"] lower = [v - s for v, s in zip(values, std)] upper = [v + s for v, s in zip(values, std)] ax.fill_between(epochs, lower, upper, alpha=0.15, color=color) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.legend(loc="best", framealpha=0.9, edgecolor="gray") ax.grid(True, alpha=0.3) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_HEATMAP = ''' {style_preamble} # Data row_labels = {row_labels} col_labels = {col_labels} data = np.array({data_matrix}) fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) im = ax.imshow(data, cmap="cividis", aspect="auto") ax.set_xticks(np.arange(len(col_labels))) ax.set_yticks(np.arange(len(row_labels))) ax.set_xticklabels(col_labels, rotation=45, ha="right") ax.set_yticklabels(row_labels) # Annotate cells for i in range(len(row_labels)): for j in range(len(col_labels)): val = data[i, j] color = "white" if val > (data.max() + data.min()) / 2 else "black" ax.text(j, i, f"{{val:.3f}}", ha="center", va="center", color=color) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") fig.colorbar(im, ax=ax, shrink=0.8) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_LINE_MULTI = ''' {style_preamble} # Data: list of series dicts with label, x, y, [std] series_data = {series_data} fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) for idx, series in enumerate(series_data): label = series["label"] x = series["x"] y = series["y"] color = COLORS[idx % len(COLORS)] ls = LINE_STYLES[idx % len(LINE_STYLES)] marker = MARKERS[idx % len(MARKERS)] ax.plot(x, y, linestyle=ls, color=color, linewidth=1.5, marker=marker, markersize=4, markevery=max(1, len(x)//8), label=label.replace("_", " ")) if "std" in series and 
series["std"]: std = series["std"] lower = [v - s for v, s in zip(y, std)] upper = [v + s for v, s in zip(y, std)] ax.fill_between(x, lower, upper, alpha=0.15, color=color) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.legend(loc="best", framealpha=0.9, edgecolor="gray") ax.grid(True, alpha=0.3) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_SCATTER = ''' {style_preamble} # Data: list of groups with label, x, y groups = {groups} fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) for idx, group in enumerate(groups): label = group["label"] x = group["x"] y = group["y"] color = COLORS[idx % len(COLORS)] marker = MARKERS[idx % len(MARKERS)] ax.scatter(x, y, c=color, marker=marker, s=40, alpha=0.7, label=label.replace("_", " ")) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.legend(loc="best", framealpha=0.9, edgecolor="gray") ax.grid(True, alpha=0.3) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATES: dict[str, str] = { "bar_comparison": _TEMPLATE_BAR_COMPARISON, "ablation_grouped": _TEMPLATE_BAR_COMPARISON, # Same template, different data "grouped_bar": _TEMPLATE_GROUPED_BAR, "training_curve": _TEMPLATE_TRAINING_CURVE, "loss_curve": _TEMPLATE_TRAINING_CURVE, "heatmap": _TEMPLATE_HEATMAP, "confusion_matrix": _TEMPLATE_HEATMAP, "line_multi": _TEMPLATE_LINE_MULTI, "scatter_plot": _TEMPLATE_SCATTER, } # --------------------------------------------------------------------------- # LaTeX / PGFPlots templates — for direct LaTeX embedding # --------------------------------------------------------------------------- _LATEX_TEMPLATE_BAR_COMPARISON = r''' \begin{{figure}}[htbp] \centering \begin{{tikzpicture}} \begin{{axis}}[ ybar, bar width=15pt, width={width}cm, height={height}cm, xlabel={{{x_label}}}, ylabel={{{y_label}}}, title={{{title}}}, symbolic x coords={{{x_coords}}}, xtick=data, x tick label 
style={{rotate=25, anchor=east, font=\small}}, ymin=0, nodes near coords, nodes near coords align={{vertical}}, every node near coord/.append style={{font=\tiny}}, grid=major, grid style={{dashed, gray!30}}, ] \addplot[fill=blue!60, draw=blue!80] coordinates {{{coords}}}; \end{{axis}} \end{{tikzpicture}} \caption{{{caption}}} \label{{fig:{figure_id}}} \end{{figure}} ''' _LATEX_TEMPLATE_LINE = r''' \begin{{figure}}[htbp] \centering \begin{{tikzpicture}} \begin{{axis}}[ width={width}cm, height={height}cm, xlabel={{{x_label}}}, ylabel={{{y_label}}}, title={{{title}}}, legend pos=north west, grid=major, grid style={{dashed, gray!30}}, cycle list name=color list, ] {plot_commands} \end{{axis}} \end{{tikzpicture}} \caption{{{caption}}} \label{{fig:{figure_id}}} \end{{figure}} ''' _LATEX_TEMPLATE_HEATMAP = r''' \begin{{figure}}[htbp] \centering \begin{{tikzpicture}} \begin{{axis}}[ colormap/viridis, colorbar, width={width}cm, height={height}cm, xlabel={{{x_label}}}, ylabel={{{y_label}}}, title={{{title}}}, point meta min={meta_min}, point meta max={meta_max}, xtick={{{xtick}}}, ytick={{{ytick}}}, xticklabels={{{xticklabels}}}, yticklabels={{{yticklabels}}}, x tick label style={{rotate=45, anchor=east, font=\small}}, ] \addplot[matrix plot*, mesh/cols={cols}, mesh/rows={rows}, point meta=explicit] coordinates {{ {matrix_coords} }}; \end{{axis}} \end{{tikzpicture}} \caption{{{caption}}} \label{{fig:{figure_id}}} \end{{figure}} ''' _LATEX_TEMPLATES: dict[str, str] = { "bar_comparison": _LATEX_TEMPLATE_BAR_COMPARISON, "ablation_grouped": _LATEX_TEMPLATE_BAR_COMPARISON, "training_curve": _LATEX_TEMPLATE_LINE, "loss_curve": _LATEX_TEMPLATE_LINE, "line_multi": _LATEX_TEMPLATE_LINE, "heatmap": _LATEX_TEMPLATE_HEATMAP, "confusion_matrix": _LATEX_TEMPLATE_HEATMAP, } class CodeGenAgent(BaseAgent): """Generates visualization code (Python or LaTeX) for each planned figure. 
Supports two output formats: - ``"python"`` (default): Matplotlib/Seaborn scripts executed by Renderer - ``"latex"``: TikZ/PGFPlots code embedded directly in the paper """ name = "figure_codegen" def __init__(self, llm: Any, *, output_format: str = "python", use_docker: bool = False) -> None: super().__init__(llm) self._output_format = output_format # "python" or "latex" self._use_docker = use_docker # BUG-60: generate Docker paths when True # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Generate plotting scripts for all planned figures. Context keys: figures (list[dict]): Figure plan from Planner experiment_results (dict): Raw experiment data condition_summaries (dict): Per-condition aggregated stats metrics_summary (dict): Per-metric aggregated stats metric_key (str): Primary metric name output_dir (str): Directory for output scripts critic_feedback (list[dict], optional): Previous Critic feedback """ try: figures = context.get("figures", []) experiment_results = context.get("experiment_results", {}) condition_summaries = context.get("condition_summaries", {}) metrics_summary = context.get("metrics_summary", {}) metric_key = context.get("metric_key", "primary_metric") output_dir = context.get("output_dir", "charts") critic_feedback = context.get("critic_feedback", []) scripts: list[dict[str, Any]] = [] for fig_spec in figures: # BUG-36: skip non-dict entries (LLM may return strings) if not isinstance(fig_spec, dict): self.logger.warning("Skipping non-dict fig_spec: %s", type(fig_spec)) continue figure_id = fig_spec.get("figure_id", "unknown") chart_type = fig_spec.get("chart_type", "bar_comparison") # Check for critic feedback on this specific figure fig_feedback = None for fb in critic_feedback: # BUG-FIX: guard against non-dict entries in feedback if isinstance(fb, dict) and fb.get("figure_id") 
== figure_id: fig_feedback = fb break script = self._generate_script( fig_spec=fig_spec, chart_type=chart_type, condition_summaries=condition_summaries, metrics_summary=metrics_summary, experiment_results=experiment_results, metric_key=metric_key, output_dir=output_dir, critic_feedback=fig_feedback, ) scripts.append({ "figure_id": figure_id, "chart_type": chart_type, "script": script, "output_filename": f"{figure_id}.png", "title": fig_spec.get("title", ""), "caption": fig_spec.get("caption", ""), "section": fig_spec.get("section", "results"), "width": fig_spec.get("width", "single_column"), }) return self._make_result(True, data={"scripts": scripts}) except Exception as exc: self.logger.error("CodeGen failed: %s", exc) return self._make_result(False, error=str(exc)) # ------------------------------------------------------------------ # Script generation # ------------------------------------------------------------------ def _generate_script( self, *, fig_spec: dict[str, Any], chart_type: str, condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], experiment_results: dict[str, Any], metric_key: str, output_dir: str, critic_feedback: dict[str, Any] | None, ) -> str: """Generate a plotting script for a single figure.""" figure_id = sanitize_figure_id(fig_spec.get("figure_id", "figure")) # BUG-20: Use absolute path to avoid CWD-relative savefig errors # BUG-60: When running in Docker, use container path directly so # renderer doesn't need fragile regex rewriting of host paths. if self._use_docker: output_path = f"/workspace/output/{figure_id}.png" else: output_path = str((Path(output_dir) / f"{figure_id}.png").resolve()) title = fig_spec.get("title", "") x_label = fig_spec.get("x_label", "") y_label = fig_spec.get("y_label", "") width_key = fig_spec.get("width", "single_column") # BUG-FIX: LLM may return data_source as a plain string (e.g. # "condition_comparison") instead of a dict. Normalize to dict. 
_raw_ds = fig_spec.get("data_source", {}) if isinstance(_raw_ds, str): data_source = {"type": _raw_ds} elif isinstance(_raw_ds, dict): data_source = _raw_ds else: data_source = {} from researchclaw.agents.figure_agent.style_config import FIGURE_WIDTH, DEFAULT_FIGURE_HEIGHT width = FIGURE_WIDTH.get(width_key, FIGURE_WIDTH["single_column"]) height = DEFAULT_FIGURE_HEIGHT # Try template-based generation first template = _TEMPLATES.get(chart_type) if template and not critic_feedback: script = self._fill_template( template=template, chart_type=chart_type, data_source=data_source, condition_summaries=condition_summaries, metrics_summary=metrics_summary, experiment_results=experiment_results, metric_key=metric_key, output_path=output_path, title=title, x_label=x_label, y_label=y_label, width=width, height=height, width_key=width_key, ) if script: return script # Fall back to LLM-generated script return self._llm_generate_script( fig_spec=fig_spec, chart_type=chart_type, condition_summaries=condition_summaries, metrics_summary=metrics_summary, experiment_results=experiment_results, metric_key=metric_key, output_path=output_path, width=width, height=height, critic_feedback=critic_feedback, width_key=width_key, ) def _fill_template( self, *, template: str, chart_type: str, data_source: dict[str, Any], condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], experiment_results: dict[str, Any], metric_key: str, output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, width_key: str = "single_column", ) -> str: """Fill a template with actual data values.""" style_preamble = get_style_preamble(width_key=width_key) source_type = data_source.get("type", "condition_comparison") if chart_type in ("bar_comparison", "ablation_grouped"): return self._fill_bar_template( template=template, condition_summaries=condition_summaries, metric_key=data_source.get("metric", metric_key), output_path=output_path, title=title, x_label=x_label, 
y_label=y_label, width=width, height=height, style_preamble=style_preamble, ) if chart_type == "grouped_bar" and source_type == "multi_metric": # BUG-37: LLM may return nested lists in metrics — flatten to list[str] _raw_metrics = data_source.get("metrics", []) _flat_metrics: list[str] = [] for _mi in (_raw_metrics if isinstance(_raw_metrics, list) else []): if isinstance(_mi, str): _flat_metrics.append(_mi) elif isinstance(_mi, list): _flat_metrics.extend(str(x) for x in _mi) else: _flat_metrics.append(str(_mi)) return self._fill_grouped_bar_template( template=template, condition_summaries=condition_summaries, metrics=_flat_metrics, output_path=output_path, title=title, x_label=x_label, y_label=y_label, width=width, height=height, style_preamble=style_preamble, ) if chart_type in ("heatmap", "confusion_matrix"): return self._fill_heatmap_template( template=template, condition_summaries=condition_summaries, metrics_summary=metrics_summary, output_path=output_path, title=title, x_label=x_label, y_label=y_label, width=width, height=height, style_preamble=style_preamble, ) # For other types, fall through to LLM generation return "" def _fill_bar_template( self, *, template: str, condition_summaries: dict[str, Any], metric_key: str, output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, style_preamble: str, ) -> str: """Fill bar comparison template with condition data.""" conditions: list[str] = [] values: list[float] = [] ci_low: list[float] = [] ci_high: list[float] = [] for cond, cdata in condition_summaries.items(): if not isinstance(cdata, dict): continue metrics = cdata.get("metrics", {}) val = metrics.get(f"{metric_key}_mean") or metrics.get(metric_key) if val is None: continue try: fval = float(val) except (ValueError, TypeError): continue conditions.append(cond) values.append(fval) ci_low.append(float(cdata.get("ci95_low", fval))) ci_high.append(float(cdata.get("ci95_high", fval))) if not conditions: return "" # Skip degenerate 
data (all zeros, all identical) if _is_degenerate_data(values): logger.warning("Skipping degenerate bar chart: all values are identical or zero") return "" # Humanize empty/raw labels if not y_label or y_label.lower().replace("_", "") in ("primarymetric", "metric"): y_label = _humanize_label(metric_key) if not x_label: x_label = "Method" return template.format( style_preamble=style_preamble, conditions=repr(conditions), values=repr(values), ci_low=repr(ci_low), ci_high=repr(ci_high), output_path=output_path, title=_esc(title), x_label=_esc(x_label), y_label=_esc(y_label), width=width, height=height, ) def _fill_grouped_bar_template( self, *, template: str, condition_summaries: dict[str, Any], metrics: list[str], output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, style_preamble: str, ) -> str: """Fill grouped bar template with multi-metric data.""" conditions: list[str] = list(condition_summaries.keys()) if not conditions or not metrics: return "" data_matrix: list[list[float]] = [] for cond in conditions: cdata = condition_summaries.get(cond, {}) cmetrics = cdata.get("metrics", {}) if isinstance(cdata, dict) else {} row = [] for m in metrics: val = cmetrics.get(f"{m}_mean") or cmetrics.get(m, 0) try: row.append(float(val)) except (ValueError, TypeError): row.append(0.0) data_matrix.append(row) return template.format( style_preamble=style_preamble, conditions=repr(conditions), metric_names=repr(metrics), data_matrix=repr(data_matrix), output_path=output_path, title=_esc(title), x_label=_esc(x_label), y_label=_esc(y_label), width=width, height=height, ) def _fill_heatmap_template( self, *, template: str, condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, style_preamble: str, ) -> str: """Fill heatmap template — rows=conditions, cols=metrics.""" conditions = list(condition_summaries.keys()) # Select non-timing metrics 
metric_names = [ m for m in metrics_summary if not any(t in m.lower() for t in ["time", "elapsed", "seed", "runtime"]) ][:8] if not conditions or not metric_names: return "" data_matrix: list[list[float]] = [] for cond in conditions: cdata = condition_summaries.get(cond, {}) cmetrics = cdata.get("metrics", {}) if isinstance(cdata, dict) else {} row = [] for m in metric_names: val = cmetrics.get(f"{m}_mean") or cmetrics.get(m, 0) try: row.append(round(float(val), 4)) except (ValueError, TypeError): row.append(0.0) data_matrix.append(row) # Skip degenerate heatmaps (all values identical) all_vals = [v for row in data_matrix for v in row] if _is_degenerate_data(all_vals): logger.warning("Skipping degenerate heatmap: all values are identical or zero") return "" # Also skip single-row heatmaps (meaningless) if len(conditions) < 2: logger.warning("Skipping heatmap with only %d row(s)", len(conditions)) return "" return template.format( style_preamble=style_preamble, row_labels=repr(conditions), col_labels=repr(metric_names), data_matrix=repr(data_matrix), output_path=output_path, title=_esc(title), x_label=_esc(x_label or "Metric"), y_label=_esc(y_label or "Method"), width=max(width, len(metric_names) * 0.8), height=max(height, len(conditions) * 0.6), ) # ------------------------------------------------------------------ # LLM-based script generation # ------------------------------------------------------------------ def _llm_generate_script( self, *, fig_spec: dict[str, Any], chart_type: str, condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], experiment_results: dict[str, Any], metric_key: str, output_path: str, width: float, height: float, critic_feedback: dict[str, Any] | None, width_key: str = "single_column", ) -> str: """Generate a plotting script using LLM.""" if self._output_format == "latex": return self._llm_generate_latex( fig_spec=fig_spec, chart_type=chart_type, condition_summaries=condition_summaries, metrics_summary=metrics_summary, 
metric_key=metric_key, width=width, height=height, critic_feedback=critic_feedback, ) style_preamble = get_style_preamble(width_key=width_key) system_prompt = ( "You are an expert scientific visualization programmer. " "Generate a standalone Python script that creates a publication-quality " "matplotlib chart.\n\n" "RULES:\n" "- The script must be completely self-contained (no external imports " "beyond matplotlib, numpy, seaborn)\n" "- All data values must be hardcoded in the script (no file I/O)\n" "- Use the provided style preamble at the top of the script\n" "- Output format: PNG at 300 DPI\n" "- Use colorblind-safe colors from the COLORS list\n" "- Include descriptive axis labels and title\n" "- Use constrained_layout=True in plt.subplots() — do NOT call fig.tight_layout()\n" "- Call fig.savefig() and plt.close(fig) at the end\n" "- Print 'Saved:
```
"""
files: dict[str, str] = {}
# Try named blocks first
for match in _CODE_BLOCK_RE.finditer(text):
fname = match.group(1).strip()
code = match.group(2).strip()
if fname and code:
# Normalize filename — strip path prefixes
fname = Path(fname).name
files[fname] = code
# If no named blocks, try unnamed and assume main.py
if not files:
for match in _UNNAMED_BLOCK_RE.finditer(text):
code = match.group(1).strip()
if code and len(code) > 50: # Skip tiny snippets
files["main.py"] = code
break
return files
# ---------------------------------------------------------------------------
# Helper: run experiment in sandbox
# ---------------------------------------------------------------------------
def _run_experiment_in_sandbox(
    exp_dir: Path,
    config: Any,
    work_dir: Path,
    timeout_sec: int = 600,
) -> dict | None:
    """Execute experiment code inside the configured Docker/sandbox backend.

    On success, returns a dict with keys: stdout, stderr, returncode,
    metrics, elapsed_sec, timed_out. Returns None when sandbox creation or
    execution fails for any reason (failure is logged, never raised).
    """
    try:
        from researchclaw.experiment.factory import create_sandbox

        # The sandbox gets its own subdirectory under the work dir.
        sandbox_root = work_dir / "sandbox"
        sandbox_root.mkdir(parents=True, exist_ok=True)

        sandbox = create_sandbox(config.experiment, sandbox_root)
        run = sandbox.run_project(exp_dir, timeout_sec=timeout_sec)
        return {
            "stdout": run.stdout,
            "stderr": run.stderr,
            "returncode": run.returncode,
            # Copy metrics into a plain dict; empty dict when none captured.
            "metrics": dict(run.metrics) if run.metrics else {},
            "elapsed_sec": run.elapsed_sec,
            "timed_out": run.timed_out,
        }
    except Exception as exc:
        logger.warning("Sandbox execution failed: %s", exc)
        return None
def _build_experiment_summary_from_run(
run_result: dict,
code: dict[str, str],
) -> dict:
"""Build an experiment_summary.json from a single sandbox run.
Parses condition-level metrics from stdout and builds the standard
summary format expected by ``assess_experiment_quality()``.
"""
metrics = run_result.get("metrics", {})
stdout = run_result.get("stdout", "")
# Also parse metrics from stdout if sandbox didn't capture them
if not metrics and stdout:
try:
from researchclaw.experiment.sandbox import parse_metrics
metrics = parse_metrics(stdout)
except ImportError:
pass
# Group metrics by condition
condition_summaries: dict[str, dict] = {}
for key, value in metrics.items():
if not isinstance(value, (int, float)):
continue
parts = key.split("/")
if len(parts) >= 3:
# Format: condition_name/seed/metric_name
cond_name = parts[0]
metric_name = parts[-1]
if cond_name not in condition_summaries:
condition_summaries[cond_name] = {"metrics": {}, "seeds": {}}
condition_summaries[cond_name]["metrics"][metric_name] = value
seed_key = "/".join(parts[1:-1])
condition_summaries[cond_name]["seeds"].setdefault(seed_key, {})[metric_name] = value
elif len(parts) == 2:
# BUG-199: Stage 13 refinement produces 2-part keys
# (condition_name/metric_name) without a seed component.
# Treat as a single-seed result.
cond_name, metric_name = parts
if cond_name not in condition_summaries:
condition_summaries[cond_name] = {"metrics": {}, "seeds": {}}
condition_summaries[cond_name]["metrics"][metric_name] = value
condition_summaries[cond_name]["seeds"].setdefault("0", {})[metric_name] = value
elif len(parts) == 1:
# Top-level metric like "primary_metric"
pass
# Compute per-condition mean metrics
for cond_name, cdata in condition_summaries.items():
seeds = cdata.get("seeds", {})
if seeds:
cdata["n_seeds"] = len(seeds)
# Average each metric across seeds
all_metrics: dict[str, list[float]] = {}
for seed_data in seeds.values():
for mk, mv in seed_data.items():
if isinstance(mv, (int, float)):
all_metrics.setdefault(mk, []).append(float(mv))
for mk, values in all_metrics.items():
if values:
cdata["metrics"][mk] = sum(values) / len(values)
# Remove seeds from final output (not standard format)
cdata.pop("seeds", None)
return {
"condition_summaries": condition_summaries,
"best_run": {
"metrics": metrics,
"status": "completed" if run_result.get("returncode") == 0 else "failed",
"stdout": stdout[:5000],
"stderr": run_result.get("stderr", "")[:2000],
},
"metrics_summary": {},
"total_conditions": len(condition_summaries),
"total_metric_keys": len(metrics),
}
================================================
FILE: researchclaw/pipeline/opencode_bridge.py
================================================
"""OpenCode 'Beast Mode' bridge — routes complex code generation to OpenCode CLI.
OpenCode (https://github.com/anomalyco/opencode) is an external AI coding agent
invoked via ``opencode run --format json "prompt"``. This module provides:
1. **ComplexityScore / score_complexity()** — analyses an experiment plan to
decide whether beast mode is warranted.
2. **OpenCodeBridge** — manages workspace creation, OpenCode invocation, file
collection, and cleanup.
"""
from __future__ import annotations
import ast
import json
import logging
import os
import re
import shutil
import subprocess
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Complexity scoring
# ---------------------------------------------------------------------------
# Keywords that indicate multi-component architectures.
# Each hit contributes to the "component_count" signal in score_complexity()
# (5 hits saturate the signal at 1.0). Matched as case-insensitive substrings.
_COMPONENT_KEYWORDS: tuple[str, ...] = (
    "encoder",
    "decoder",
    "discriminator",
    "generator",
    "critic",
    "actor",
    "teacher",
    "student",
    "backbone",
    "head",
    "neck",
    "classifier",
    "embedder",
    "attention",
    "transformer",
    "tokenizer",
    "vae",
    "autoencoder",
)
# Indicators that multi-file generation is needed
# ("file_count_hint" signal; 3 hits saturate at 1.0).
_FILE_HINT_KEYWORDS: tuple[str, ...] = (
    "model.py",
    "trainer.py",
    "dataset.py",
    "utils.py",
    "config.py",
    "multiple files",
    "modular",
    "separate module",
    "multi-file",
)
# Domain-complexity keywords
# ("domain_complexity" signal; 3 hits saturate at 1.0).
_DOMAIN_COMPLEX_KEYWORDS: tuple[str, ...] = (
    "multi-modal",
    "multimodal",
    "distributed",
    "gan",
    "diffusion",
    "nerf",
    "mixture of experts",
    "moe",
    "meta-learning",
    "meta learning",
    "maml",
    "neural ode",
    "neural sde",
    "physics-informed",
    "pinn",
    "graph neural",
    "gnn",
    "reinforcement learning",
    "multi-agent",
    "world model",
    "vision-language",
    "text-to-image",
    "image-to-text",
)
# Patterns suggesting deep dependency chains
# ("dependency_depth" signal; 3 hits saturate at 1.0).
_DEPENDENCY_KEYWORDS: tuple[str, ...] = (
    "custom layer",
    "custom loss",
    "wrapper",
    "registry",
    "hook",
    "callback",
    "scheduler",
    "custom optimizer",
    "custom dataset",
    "custom sampler",
    "custom transform",
)
@dataclass
class ComplexityScore:
    """Result of complexity analysis on an experiment plan.

    Produced by ``score_complexity()``; ``recommendation`` tells the caller
    which code-generation path to use for the experiment.
    """

    # Weighted overall complexity; rounded to 4 decimal places.
    score: float  # 0.0-1.0
    # Per-signal sub-scores (each in [0.0, 1.0], rounded to 3 decimals),
    # keyed e.g. "component_count", "dependency_depth".
    signals: dict[str, float] = field(default_factory=dict)
    # "legacy" is only emitted for empty input; otherwise the score-vs-threshold
    # comparison yields "beast_mode" or "code_agent".
    recommendation: str = ""  # "beast_mode" | "code_agent" | "legacy"
    # Human-readable explanation of the recommendation.
    reason: str = ""
def _count_keyword_hits(text: str, keywords: tuple[str, ...]) -> int:
text_lower = text.lower()
return sum(1 for kw in keywords if kw in text_lower)
def score_complexity(
    exp_plan: str,
    topic: str = "",
    *,
    historical_failures: int = 0,
    threshold: float = 0.6,
) -> ComplexityScore:
    """Score the complexity of an experiment to determine if beast mode is warranted.

    Combines six keyword/heuristic signals into a weighted score in
    [0.0, 1.0] and compares it against *threshold* to pick a recommendation.

    Returns a ComplexityScore with score in [0.0, 1.0].
    """
    if not exp_plan and not topic:
        return ComplexityScore(
            score=0.0,
            signals={},
            recommendation="legacy",
            reason="Empty plan",
        )

    combined = f"{topic}\n{exp_plan}"

    def _saturate(hits: int, divisor: float) -> float:
        # Linear ramp that caps at 1.0 once `divisor` hits are reached.
        return min(hits / divisor, 1.0)

    # Keyword-driven signals (weights: 0.25 / 0.20 / 0.20 / 0.10).
    component_score = _saturate(_count_keyword_hits(combined, _COMPONENT_KEYWORDS), 5.0)
    file_score = _saturate(_count_keyword_hits(combined, _FILE_HINT_KEYWORDS), 3.0)
    domain_score = _saturate(_count_keyword_hits(combined, _DOMAIN_COMPLEX_KEYWORDS), 3.0)
    dep_score = _saturate(_count_keyword_hits(combined, _DEPENDENCY_KEYWORDS), 3.0)

    # Condition count (weight 0.15): numbered conditions/ablations/variants
    # plus every "baseline" mention.
    condition_pattern = re.compile(
        r"(?:condition|ablation|variant|experiment)\s*[\-_:]?\s*\d+",
        re.IGNORECASE,
    )
    n_conditions = len(condition_pattern.findall(combined)) + combined.lower().count("baseline")
    condition_score = _saturate(n_conditions, 8.0)

    # Historical failures (weight 0.10): three prior failures saturate.
    failure_score = _saturate(historical_failures, 3.0)

    # Weighted sum; weights total 1.0, then clamp defensively.
    weighted = (
        0.25 * component_score
        + 0.20 * file_score
        + 0.20 * domain_score
        + 0.15 * condition_score
        + 0.10 * failure_score
        + 0.10 * dep_score
    )
    final_score = min(max(weighted, 0.0), 1.0)

    signals = {
        "component_count": round(component_score, 3),
        "file_count_hint": round(file_score, 3),
        "domain_complexity": round(domain_score, 3),
        "condition_count": round(condition_score, 3),
        "historical_failure": round(failure_score, 3),
        "dependency_depth": round(dep_score, 3),
    }

    if final_score >= threshold:
        recommendation = "beast_mode"
        top_three = sorted(signals.items(), key=lambda x: -x[1])[:3]
        reason = (
            f"Complexity {final_score:.2f} >= threshold {threshold:.2f}: "
            f"top signals: "
            + ", ".join(f"{k}={v:.2f}" for k, v in top_three)
        )
    else:
        recommendation = "code_agent"
        reason = f"Complexity {final_score:.2f} < threshold {threshold:.2f}"

    return ComplexityScore(
        score=round(final_score, 4),
        signals=signals,
        recommendation=recommendation,
        reason=reason,
    )
# ---------------------------------------------------------------------------
# OpenCode bridge
# ---------------------------------------------------------------------------
@dataclass
class OpenCodeResult:
"""Result from an OpenCode invocation."""
success: bool
files: dict[str, str] = field(default_factory=dict)
opencode_log: str = ""
elapsed_sec: float = 0.0
error: str = ""
# Prompt passed to ``opencode run`` in beast mode. Format placeholders:
#   {metric}          — primary metric name that main.py must print
#   {time_budget_sec} — total wall-clock budget in seconds (appears twice)
# The referenced EXPERIMENT_PLAN.yaml and GUIDANCE.md files are written into
# the workspace by OpenCodeBridge._prepare_workspace before invocation.
_MEGA_PROMPT_TEMPLATE = """\
You are implementing a complete, runnable ML/science experiment.
Read the files in the current workspace:
- EXPERIMENT_PLAN.yaml — the full experiment design
- GUIDANCE.md — topic, metric, environment constraints, domain-specific guidance
Your task:
1. Design the file structure (main.py is the required entry point).
2. Implement ALL files with complete, runnable code. No placeholders or TODOs.
3. main.py must be the entry point and print the primary metric as:
{metric}:
4. Include numerical stability guards (gradient clipping, NaN detection, etc.).
5. Use multi-seed evaluation (seeds 0, 1, 2) and report mean ± std.
6. Each ablation/condition MUST be genuinely different — not copy-paste with a renamed variable.
7. Implement a time guard: stop gracefully at 80% of the time budget ({time_budget_sec} seconds).
8. Write requirements.txt listing any extra pip packages needed.
9. If the experiment needs dataset downloads, write a setup.py that handles them.
IMPORTANT CONSTRAINTS:
- The code will run in an isolated Docker container with PyTorch, torchvision, and common ML packages pre-installed.
- Do NOT use argparse or CLI arguments — hardcode all configuration.
- All output must go to stdout (print statements).
- Keep the experiment feasible within {time_budget_sec} seconds total.
"""
class OpenCodeBridge:
"""Manages OpenCode CLI invocations for beast mode code generation."""
    def __init__(
        self,
        *,
        model: str = "",
        llm_base_url: str = "",
        api_key_env: str = "",
        llm_provider: str = "openai-compatible",
        timeout_sec: int = 600,
        max_retries: int = 1,
        workspace_cleanup: bool = True,
    ) -> None:
        """Store configuration; no I/O or validation happens here.

        Args:
            model: Model id, either a bare name or "provider/name"
                (see _resolve_opencode_model for how it is qualified).
            llm_base_url: OpenAI-compatible endpoint base URL; "" lets the
                OpenCode CLI use its own defaults (see _build_opencode_config).
            api_key_env: Name of the environment variable holding the API key;
                forwarded as OPENAI_API_KEY by _invoke_opencode.
            llm_provider: Provider label; inspected by _is_azure().
            timeout_sec: Timeout for each ``opencode run`` subprocess call.
            max_retries: Retry budget — consumed outside this view (TODO confirm).
            workspace_cleanup: Whether workspaces are deleted afterwards —
                consumed outside this view (TODO confirm).
        """
        self._model = model
        self._llm_base_url = llm_base_url
        self._api_key_env = api_key_env
        self._llm_provider = llm_provider
        self._timeout_sec = timeout_sec
        self._max_retries = max_retries
        self._workspace_cleanup = workspace_cleanup
# -- availability check ---------------------------------------------------
@staticmethod
def check_available() -> bool:
"""Return True if the ``opencode`` CLI is installed and callable."""
opencode_cmd = shutil.which("opencode")
if not opencode_cmd:
return False
try:
result = subprocess.run(
[opencode_cmd, "--version"],
capture_output=True,
text=True,
timeout=15,
)
return result.returncode == 0
except FileNotFoundError:
return False
except subprocess.TimeoutExpired:
return False
except Exception: # noqa: BLE001
return False
# -- workspace preparation ------------------------------------------------
def _prepare_workspace(
self,
stage_dir: Path,
topic: str,
exp_plan: str,
metric: str,
pkg_hint: str,
extra_guidance: str,
time_budget_sec: int,
) -> Path:
"""Create a temporary workspace directory with context files."""
ws = stage_dir / f"opencode_beast_{int(time.time())}_{time.monotonic_ns() % 100000}"
ws.mkdir(parents=True, exist_ok=True)
# Write experiment plan
(ws / "EXPERIMENT_PLAN.yaml").write_text(
exp_plan or "# No experiment plan provided\n",
encoding="utf-8",
)
# Write guidance document
guidance_parts = [
f"# Experiment Guidance\n",
f"## Topic\n{topic}\n",
f"## Primary Metric\n{metric}\n",
f"## Time Budget\n{time_budget_sec} seconds\n",
]
if pkg_hint:
guidance_parts.append(f"## Environment\n{pkg_hint}\n")
if extra_guidance:
guidance_parts.append(f"## Additional Guidance\n{extra_guidance}\n")
(ws / "GUIDANCE.md").write_text(
"\n".join(guidance_parts), encoding="utf-8",
)
# Write opencode.json config
opencode_cfg = self._build_opencode_config()
(ws / "opencode.json").write_text(
json.dumps(opencode_cfg, indent=2), encoding="utf-8",
)
# OpenCode requires a git repository — initialise one with
# a single commit so that ``opencode run`` doesn't hang.
# BUG-OB-01/OB-02: Check return codes and catch TimeoutExpired.
try:
r = subprocess.run(
["git", "init"],
cwd=str(ws), capture_output=True, timeout=10,
)
if r.returncode != 0:
raise OSError(f"git init failed: {r.stderr}")
subprocess.run(
["git", "add", "-A"],
cwd=str(ws), capture_output=True, timeout=10,
)
subprocess.run(
["git", "-c", "user.email=beast@researchclaw",
"-c", "user.name=BeastMode",
"commit", "-m", "init workspace"],
cwd=str(ws), capture_output=True, timeout=10,
)
except subprocess.TimeoutExpired as exc:
raise OSError(f"git workspace init timed out: {exc}") from exc
return ws
def _is_azure(self) -> bool:
"""Detect Azure OpenAI from base URL or provider string."""
return (
"azure" in (self._llm_base_url or "").lower()
or "azure" in (self._llm_provider or "").lower()
)
def _build_opencode_config(self) -> dict[str, Any]:
"""Build the opencode.json configuration.
Always uses the "openai" provider — this works for both standard
OpenAI endpoints and Azure OpenAI (which accepts Bearer token auth
on the ``/openai/v1`` path and now supports the Responses API).
"""
cfg: dict[str, Any] = {
"$schema": "https://opencode.ai/config.json",
}
if self._llm_base_url:
if self._model:
cfg["model"] = (
self._model if "/" in self._model
else f"openai/{self._model}"
)
cfg["provider"] = {
"openai": {
"options": {
"baseURL": self._llm_base_url,
"apiKey": f"{{env:{self._api_key_env}}}"
if self._api_key_env
else "",
},
"models": {},
}
}
# Register the model so OpenCode knows it exists
if self._model:
model_name = self._model.split("/")[-1]
cfg["provider"]["openai"]["models"] = {
model_name: {
"name": model_name,
"modalities": {
"input": ["text"],
"output": ["text"],
},
}
}
elif self._model:
cfg["model"] = (
self._model if "/" in self._model
else f"openai/{self._model}"
)
return cfg
# -- model resolution -------------------------------------------------------
def _resolve_opencode_model(self) -> str:
"""Resolve the model identifier for OpenCode CLI's ``-m`` flag.
Resolution order:
1. If model already contains "/" (e.g. "anthropic/claude-sonnet-4-6") → use as-is
2. Otherwise → "openai/{model}" (works for both Azure and standard OpenAI)
Note: Azure AI Services now supports the Responses API with Bearer
token auth via the OpenAI-compatible endpoint, so we use the "openai"
provider universally — no Anthropic fallback needed.
"""
if not self._model:
return "anthropic/claude-sonnet-4-6"
if "/" in self._model:
return self._model
return f"openai/{self._model}"
    # -- invocation ------------------------------------------------------------
    def _invoke_opencode(
        self,
        workspace: Path,
        prompt: str,
    ) -> tuple[bool, str, float]:
        """Run ``opencode run`` in the workspace. Returns (success, log, elapsed).

        success is True only when the CLI exits with returncode 0. log is the
        combined stdout + stderr, or an error description on failure. elapsed
        is wall-clock seconds measured with time.monotonic().
        """
        env = os.environ.copy()
        # Pass API key via environment if configured
        if self._api_key_env:
            api_key = os.environ.get(self._api_key_env, "")
            if api_key:
                # We always use the "openai" provider for OpenCode now,
                # which reads OPENAI_API_KEY (works for Azure too via
                # Bearer token auth on the OpenAI-compatible endpoint).
                env["OPENAI_API_KEY"] = api_key
        # Use -m flag to specify model (more reliable than opencode.json)
        resolved_model = self._resolve_opencode_model()
        # Fall back to the bare command name if PATH lookup fails; the
        # FileNotFoundError branch below handles a truly missing binary.
        opencode_cmd = shutil.which("opencode") or "opencode"
        cmd = [opencode_cmd, "run", "-m", resolved_model, "--format", "json", prompt]
        t0 = time.monotonic()
        try:
            result = subprocess.run(
                cmd,
                cwd=str(workspace),
                capture_output=True,
                text=True,
                timeout=self._timeout_sec,
                env=env,
            )
            elapsed = time.monotonic() - t0
            log = result.stdout + "\n" + result.stderr
            return result.returncode == 0, log, elapsed
        except subprocess.TimeoutExpired as exc:
            elapsed = time.monotonic() - t0
            log = f"TIMEOUT after {elapsed:.1f}s"
            # exc.stdout may be str or bytes depending on how the capture was
            # interrupted — decode defensively and cap at 2000 chars.
            if exc.stdout:
                log += f"\nstdout: {exc.stdout[:2000] if isinstance(exc.stdout, str) else exc.stdout.decode(errors='replace')[:2000]}"
            return False, log, elapsed
        except FileNotFoundError:
            # The "opencode" fallback command itself was not found.
            return False, "opencode CLI not found", 0.0
        except Exception as exc:  # noqa: BLE001
            elapsed = time.monotonic() - t0
            return False, f"Unexpected error: {exc}", elapsed
# -- file collection -------------------------------------------------------
@staticmethod
def _collect_files(workspace: Path) -> dict[str, str]:
"""Collect generated Python files, requirements.txt, and setup.py.
File names are flattened to basenames (e.g. ``src/main.py`` → ``main.py``)
because the downstream executor expects a flat file dict. If two files
share the same basename, the one closer to the workspace root wins.
"""
files: dict[str, str] = {}
# Sort by depth (fewer parts first) so root-level files take priority
py_files = sorted(
workspace.rglob("*.py"),
key=lambda p: len(p.relative_to(workspace).parts),
)
for py_file in py_files:
rel = py_file.relative_to(workspace)
parts = rel.parts
if any(p.startswith("__pycache__") or p.startswith(".") for p in parts):
continue
# Flatten to basename — executor expects flat structure
basename = rel.name
if basename not in files:
try:
files[basename] = py_file.read_text(encoding="utf-8", errors="replace")
except OSError as exc:
logger.warning("Beast mode: failed to read %s: %s", py_file, exc)
# Also collect requirements.txt and setup.py at root
for extra in ("requirements.txt", "setup.py"):
p = workspace / extra
if p.exists() and extra not in files:
files[extra] = p.read_text(encoding="utf-8", errors="replace")
return files
# -- entry-point validation ------------------------------------------------
@staticmethod
def _has_main_guard(source: str) -> bool:
"""Return True if *source* contains ``if __name__ == "__main__":``."""
try:
tree = ast.parse(source)
except SyntaxError:
return False
for node in ast.walk(tree):
if isinstance(node, ast.If):
test = node.test
if isinstance(test, ast.Compare) and isinstance(test.left, ast.Name):
if test.left.id == "__name__" and len(test.comparators) == 1:
comp = test.comparators[0]
if isinstance(comp, ast.Constant) and comp.value == "__main__":
return True
return False
@staticmethod
def _ensure_main_entry_point(files: dict[str, str]) -> dict[str, str]:
    """Ensure ``main.py`` has an ``if __name__ == "__main__"`` guard.

    Beast Mode often generates multi-file projects where ``main.py`` is a
    library module and the real entry point lives in another file (e.g.
    ``run_experiment.py``). Since the Docker sandbox always executes
    ``python3 main.py``, a library-only ``main.py`` exits immediately with
    no output.

    Strategy:
    1. If ``main.py`` already has the guard → return unchanged.
    2. Find the first other ``.py`` file that **does** have the guard.
    3. Swap: rename that file to ``main.py`` and store the old ``main.py``
       code under the other file's name.
    4. If no file has a guard, append a minimal stub to ``main.py`` that
       calls the most likely entry function (``main()``, ``run()``, etc.).
       Async entry functions are driven via ``asyncio.run`` — a bare call
       would only create a coroutine and never execute it.

    The input dict is never mutated; a new dict is returned when changed.
    """
    main_code = files.get("main.py", "")
    if not main_code:
        return files
    if OpenCodeBridge._has_main_guard(main_code):
        return files
    # -- Strategy 2/3: find another file with the guard and swap -----------
    for fname, code in files.items():
        if fname == "main.py" or not fname.endswith(".py"):
            continue
        if OpenCodeBridge._has_main_guard(code):
            logger.info(
                "Beast mode: main.py lacks __main__ guard; swapping "
                "entry point with %s",
                fname,
            )
            new_files = dict(files)
            # Old main.py becomes a helper module under the other file's
            # name, so intra-project imports keep resolving.
            new_files[fname] = main_code
            new_files["main.py"] = code
            return new_files
    # -- Strategy 4: inject a minimal entry point into main.py -------------
    # Look for common entry functions defined in main.py (first match in
    # ast.walk order wins, matching the previous behavior).
    entry_func: str | None = None
    entry_is_async = False
    try:
        tree = ast.parse(main_code)
        _known_names = (
            "main", "run", "run_experiment", "train",
            "run_experiments", "experiment", "run_all",
        )
        for n in ast.walk(tree):
            if (
                isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
                and n.name in _known_names
            ):
                entry_func = n.name
                entry_is_async = isinstance(n, ast.AsyncFunctionDef)
                break
    except SyntaxError:
        pass
    if entry_func:
        logger.info(
            "Beast mode: main.py lacks __main__ guard; injecting call "
            "to %s()",
            entry_func,
        )
        if entry_is_async:
            # BUGFIX: calling an async def bare would never await it —
            # drive the coroutine with asyncio.run().
            stub = (
                "\n\n\nif __name__ == \"__main__\":\n"
                + "    import asyncio\n"
                + f"    asyncio.run({entry_func}())\n"
            )
        else:
            stub = (
                "\n\n\nif __name__ == \"__main__\":\n"
                + f"    {entry_func}()\n"
            )
        new_files = dict(files)
        new_files["main.py"] = main_code.rstrip() + stub
        return new_files
    logger.warning(
        "Beast mode: main.py lacks __main__ guard and no known entry "
        "function found — experiment may exit without producing output",
    )
    return files
# -- main entry point ------------------------------------------------------
def generate(
    self,
    stage_dir: Path,
    topic: str,
    exp_plan: str,
    metric: str,
    pkg_hint: str = "",
    extra_guidance: str = "",
    time_budget_sec: int = 300,
) -> OpenCodeResult:
    """Run OpenCode to generate experiment code.

    Retries up to ``1 + self._max_retries`` times.  Each attempt prepares
    a fresh workspace, invokes the OpenCode CLI with the mega-prompt, and
    collects the generated files on success.  The workspace is removed
    after each attempt when ``self._workspace_cleanup`` is set.

    Parameters
    ----------
    stage_dir:
        Stage output directory; ``opencode_log.txt`` is written here.
    topic, exp_plan, metric, pkg_hint, extra_guidance:
        Prompt inputs forwarded to ``_prepare_workspace``.
    time_budget_sec:
        Runtime budget substituted into the mega-prompt.

    Returns
    -------
    OpenCodeResult
        ``success=True`` with the collected files, or ``success=False``
        carrying the last error once all attempts are exhausted.
    """
    # Check availability first — fail fast, no point retrying a missing CLI.
    if not self.check_available():
        return OpenCodeResult(
            success=False,
            error="OpenCode CLI not installed or not callable",
        )
    workspace: Path | None = None
    last_error = ""
    for attempt in range(1 + self._max_retries):
        # Prepare workspace (a fresh one per attempt)
        try:
            workspace = self._prepare_workspace(
                stage_dir=stage_dir,
                topic=topic,
                exp_plan=exp_plan,
                metric=metric,
                pkg_hint=pkg_hint,
                extra_guidance=extra_guidance,
                time_budget_sec=time_budget_sec,
            )
        except OSError as exc:
            last_error = f"Failed to prepare workspace: {exc}"
            logger.warning("Beast mode: %s", last_error)
            continue
        # Build the mega-prompt (use replace instead of .format() to
        # avoid KeyError when metric contains curly braces like "F{1}")
        prompt = _MEGA_PROMPT_TEMPLATE.replace(
            "{metric}", metric
        ).replace(
            "{time_budget_sec}", str(time_budget_sec)
        )
        logger.info(
            "Beast mode: invoking OpenCode (attempt %d/%d, timeout=%ds)",
            attempt + 1,
            1 + self._max_retries,
            self._timeout_sec,
        )
        success, log, elapsed = self._invoke_opencode(workspace, prompt)
        if success:
            files = self._collect_files(workspace)
            # A run without main.py is useless to the sandbox — treat as
            # a failed attempt and retry.
            if "main.py" not in files:
                logger.warning(
                    "Beast mode: OpenCode succeeded but no main.py found "
                    "(files: %s)", list(files.keys()),
                )
                last_error = "No main.py in OpenCode output"
                # Cleanup failed workspace
                if self._workspace_cleanup and workspace.exists():
                    shutil.rmtree(workspace, ignore_errors=True)
                continue
            # BUG-R52-01: Ensure main.py has an entry point
            files = self._ensure_main_entry_point(files)
            # Write log (best-effort; a failed write must not fail the run)
            try:
                (stage_dir / "opencode_log.txt").write_text(
                    log or "", encoding="utf-8",
                )
            except OSError as _wexc:
                logger.warning("Beast mode: failed to write log: %s", _wexc)
            # Cleanup workspace if configured
            if self._workspace_cleanup and workspace.exists():
                shutil.rmtree(workspace, ignore_errors=True)
            return OpenCodeResult(
                success=True,
                files=files,
                opencode_log=log,
                elapsed_sec=elapsed,
            )
        last_error = log
        logger.warning(
            "Beast mode: OpenCode attempt %d failed (%.1fs): %s",
            attempt + 1,
            elapsed,
            log[:500],
        )
        # Cleanup failed workspace (workspace may be None if every
        # _prepare_workspace call failed)
        if self._workspace_cleanup and workspace and workspace.exists():
            shutil.rmtree(workspace, ignore_errors=True)
    # All attempts failed
    return OpenCodeResult(
        success=False,
        opencode_log=last_error,
        error=f"OpenCode failed after {1 + self._max_retries} attempt(s)",
    )
# ---------------------------------------------------------------------------
# Helper: count historical failures
# ---------------------------------------------------------------------------
def count_historical_failures(run_dir: Path, stage_name: str = "stage-10") -> int:
    """Count past Stage 10 failures recorded under *run_dir*.

    A stage directory counts as one failure (at most once) when any of
    these indicators is present:

    - ``beast_mode_log.json`` with a falsy ``success`` flag,
    - ``stage_health.json`` with ``status == "FAILED"``,
    - ``validation_report.md`` containing ``BLOCKED`` or ``FAILED``.

    All reads are best-effort; unreadable or malformed files are ignored.
    """

    def _beast_mode_failed(stage_dir: Path) -> bool:
        # Beast-mode runs record an explicit success flag.
        path = stage_dir / "beast_mode_log.json"
        if not path.exists():
            return False
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            return not payload.get("success", True)
        except Exception:  # noqa: BLE001
            return False

    def _health_failed(stage_dir: Path) -> bool:
        path = stage_dir / "stage_health.json"
        if not path.exists():
            return False
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            return payload.get("status") == "FAILED"
        except Exception:  # noqa: BLE001
            return False

    def _validation_failed(stage_dir: Path) -> bool:
        path = stage_dir / "validation_report.md"
        if not path.exists():
            return False
        try:
            report = path.read_text(encoding="utf-8")
            return "BLOCKED" in report or "FAILED" in report
        except Exception:  # noqa: BLE001
            return False

    indicators = (_beast_mode_failed, _health_failed, _validation_failed)
    return sum(
        1
        for stage_dir in run_dir.glob(f"{stage_name}*")
        if any(check(stage_dir) for check in indicators)
    )
================================================
FILE: researchclaw/pipeline/paper_verifier.py
================================================
"""Post-generation paper verification gate.
Extracts all numeric values from a generated LaTeX paper, compares them
against the ``VerifiedRegistry``, and rejects the paper if unverified
numbers appear in strict sections (Results, Experiments, Tables).
This is the **hard, deterministic** defense against fabrication.
"""
from __future__ import annotations
import logging
import math
import re
from dataclasses import dataclass, field
from pathlib import Path
from researchclaw.pipeline.verified_registry import VerifiedRegistry
logger = logging.getLogger(__name__)
# Numbers that are always allowed (years, common constants, etc.)
_ALWAYS_ALLOWED: set[float] = {
0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0,
0.5, 0.01, 0.001, 0.0001, 0.1, 0.05, 0.95, 0.99,
2024.0, 2025.0, 2026.0, 2027.0,
8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0,
224.0, 299.0, 384.0, # Common image sizes
# BUG-192: Common hyperparameter values
0.0003, 3e-4, 0.0005, 5e-4, 0.002, 2e-3, # learning rates
0.2, 0.3, 0.25, 0.7, 0.6, 0.8, # clip epsilon, dropout, gradient clip, GCE q, common HP
0.9, 0.999, 0.9999, # Adam betas, momentum
0.02, 0.03, # weight init std
1e-5, 1e-6, 1e-8, # epsilon, weight decay
300.0, 400.0, 500.0, # epochs
4096.0, 8192.0, # larger batch sizes / hidden dims
}
# Regex for extracting decimal numbers (including negative, scientific notation)
# NOTE: lookbehind/lookahead must NOT exclude { } — numbers inside \textbf{91.5}
# must still be extracted. We only exclude letters, underscore, and backslash.
_NUMBER_RE = re.compile(
r"(? float:
"""Fraction of numbers that are unverified."""
if self.total_numbers_checked == 0:
return 0.0
return len(self.unverified_numbers) / self.total_numbers_checked
def verify_paper(
    tex_text: str,
    registry: VerifiedRegistry,
    *,
    tolerance: float = 0.01,
    strict_sections: set[str] | None = None,
    lenient_sections: set[str] | None = None,
) -> VerificationResult:
    """Verify that all numbers in the paper are grounded in experiment data.

    Parameters
    ----------
    tex_text:
        The full LaTeX source of the paper.
    registry:
        The verified value registry built from experiment data.
    tolerance:
        Relative tolerance for number matching (default 1%).
    strict_sections:
        Section names where unverified numbers cause REJECT.
    lenient_sections:
        Section names where unverified numbers cause WARNING only.

    Returns
    -------
    VerificationResult
        Contains pass/fail status, list of unverified numbers, and summary.
    """
    if strict_sections is None:
        strict_sections = _STRICT_SECTIONS
    if lenient_sections is None:
        lenient_sections = _LENIENT_SECTIONS
    result = VerificationResult(passed=True, severity="PASS")
    # 1. Parse sections
    sections = _parse_sections(tex_text)
    # 2. Find all tables (for in_table flag)
    table_ranges = _find_table_ranges(tex_text)
    # 3. Create skip mask (positions to ignore)
    skip_mask = _build_skip_mask(tex_text)
    # 4. Extract and verify numbers
    lines = tex_text.split("\n")
    # PERF: precompute each line's absolute character offset once.  The
    # previous code called _line_offset() for every number, re-summing all
    # preceding line lengths each time — O(lines^2) on long papers.
    line_starts: list[int] = []
    _offset = 0
    for _line in lines:
        line_starts.append(_offset)
        _offset += len(_line) + 1  # +1 for the newline removed by split("\n")
    for line_idx, line in enumerate(lines):
        line_num = line_idx + 1
        section = _section_at_line(sections, line_idx)
        section_lower = section.lower() if section else ""
        # Only *results* tables trigger strict handling; hyperparameter /
        # config tables carry is_results=False (BUG-192).
        in_table = any(
            start <= line_idx <= end and is_results
            for start, end, is_results in table_ranges
        )
        for m in _NUMBER_RE.finditer(line):
            num_str = m.group(1)
            char_pos = line_starts[line_idx] + m.start()
            # Skip if inside a skip zone
            if skip_mask[char_pos]:
                continue
            try:
                value = float(num_str)
            except ValueError:
                continue
            if not math.isfinite(value):
                continue
            result.total_numbers_checked += 1
            # Always-allowed numbers
            if value in _ALWAYS_ALLOWED:
                result.total_numbers_verified += 1
                continue
            # Integer-like small numbers (likely indices, counts, etc.)
            # BUG-23 P1: In strict sections or tables, only auto-pass very small
            # integers (≤5) — larger counts (e.g. "20 datasets") could be fabricated.
            is_strict_ctx = _is_strict_section(section_lower, strict_sections) or in_table
            _int_limit = 5 if is_strict_ctx else 20
            if value == int(value) and abs(value) <= _int_limit:
                result.total_numbers_verified += 1
                continue
            # Check against registry
            if registry.is_verified(value, tolerance=tolerance):
                result.total_numbers_verified += 1
                continue
            # UNVERIFIED — classify severity by section
            ctx = line.strip()[:120]
            unv = UnverifiedNumber(
                value=value,
                line_number=line_num,
                context=ctx,
                section=section or "(preamble)",
                in_table=in_table,
            )
            result.unverified_numbers.append(unv)
            is_strict = _is_strict_section(section_lower, strict_sections)
            if is_strict or in_table:
                result.strict_violations += 1
            else:
                result.lenient_violations += 1
    # 5. Check for fabricated conditions
    result.fabricated_conditions = _check_condition_names(tex_text, registry, lines)
    # 5b. BUG-23 P2: Check training config claims (epochs, dataset, etc.)
    result.config_warnings = _check_training_config(tex_text, registry)
    # 6. Determine severity
    if result.strict_violations > 0 or len(result.fabricated_conditions) > 0:
        result.passed = False
        result.severity = "REJECT"
    elif result.lenient_violations > 0:
        result.passed = True
        result.severity = "WARN"
    else:
        result.passed = True
        result.severity = "PASS"
    # 7. Build summary
    result.summary = _build_summary(result)
    logger.info("Paper verification: %s", result.summary)
    return result
def verify_paper_file(
    tex_path: Path,
    registry: VerifiedRegistry,
    **kwargs,
) -> VerificationResult:
    """Convenience wrapper: read *tex_path* and run :func:`verify_paper` on it."""
    source = tex_path.read_text(encoding="utf-8")
    return verify_paper(source, registry, **kwargs)
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _parse_sections(tex_text: str) -> list[tuple[int, str]]:
    """Locate section headings in the LaTeX source.

    Returns ``(line_index, section_name)`` pairs, ordered by line index.
    """
    return [
        (idx, hit.group(1).strip())
        for idx, raw in enumerate(tex_text.split("\n"))
        for hit in [_SECTION_RE.search(raw)]
        if hit is not None
    ]
def _section_at_line(sections: list[tuple[int, str]], line_idx: int) -> str | None:
"""Return the section name that contains the given line."""
current = None
for sec_line, sec_name in sections:
if sec_line <= line_idx:
current = sec_name
else:
break
return current
# Keywords that exempt a section from strict checking even when its title
# matches a strict-section name (BUG-R49-02, see _is_strict_section):
# e.g. "Datasets and Evaluation Protocol" describes setup, not results.
_STRICT_EXEMPT_KEYWORDS: set[str] = {
    "dataset", "setup", "protocol", "hyperparameter", "implementation",
    "hardware", "infrastructure", "notation", "preliminaries",
}
def _is_strict_section(section_lower: str, strict_set: set[str]) -> bool:
    """Decide whether a (lower-cased) section title gets strict checking.

    BUG-R49-02: titles such as "Datasets and Evaluation Protocol" contain a
    strict keyword ("evaluation") yet describe protocol parameters, not
    results.  Any setup/protocol keyword therefore exempts the section.
    """
    if not section_lower:
        return False
    if not any(name in section_lower for name in strict_set):
        return False
    # Exemption: setup/protocol wording overrides the strict match.
    return not any(kw in section_lower for kw in _STRICT_EXEMPT_KEYWORDS)
def _find_table_ranges(tex_text: str) -> list[tuple[int, int, bool]]:
"""Find line ranges of table environments.
Returns ``(start_line, end_line, is_results_table)`` tuples.
Hyperparameter / configuration tables (detected by ``\\caption`` keywords)
are marked ``is_results_table=False`` so the verifier skips strict checks
on their numeric content (BUG-192).
"""
_HP_CAPTION_KW = {
"hyperparameter", "hyper-parameter", "configuration", "config",
"setting", "training detail", "implementation detail",
}
ranges: list[tuple[int, int, bool]] = []
lines = tex_text.split("\n")
in_table = False
start = 0
for i, line in enumerate(lines):
if r"\begin{table" in line:
in_table = True
start = i
elif r"\end{table" in line and in_table:
# Scan table block for \caption to determine type
table_block = "\n".join(lines[start : i + 1]).lower()
is_hp = any(kw in table_block for kw in _HP_CAPTION_KW)
ranges.append((start, i, not is_hp))
in_table = False
return ranges
def _build_skip_mask(tex_text: str) -> list[bool]:
    """Build a per-character mask; ``True`` marks positions to ignore."""
    mask = [False] * len(tex_text)
    for pattern in _SKIP_PATTERNS:
        for hit in pattern.finditer(tex_text):
            span = hit.end() - hit.start()
            # Matches never extend past the text, so slice assignment is safe.
            mask[hit.start() : hit.end()] = [True] * span
    return mask
def _line_offset(lines: list[str], line_idx: int) -> int:
"""Return the character offset of the start of a line."""
offset = 0
for i in range(line_idx):
offset += len(lines[i]) + 1 # +1 for newline
return offset
def _check_condition_names(
    tex_text: str,
    registry: VerifiedRegistry,
    lines: list[str],
) -> list[FabricatedCondition]:
    """Check if the paper mentions condition names that never ran.

    Two heuristic passes over the LaTeX source:

    1. First cells of table rows (``... & ... \\\\``) — likely condition
       labels.
    2. ``\\textbf{...}`` / ``\\textit{...}`` spans in prose within
       results-like sections (BUG-23 P2).

    A name is flagged when it is not a known condition from *registry*,
    not a generic table term, not a LaTeX command, and not numeric-looking.
    Each distinct name (case-insensitive) is reported at most once.
    """
    fabricated: list[FabricatedCondition] = []
    # Only check if we have known conditions
    if not registry.condition_names:
        return fabricated
    # Build pattern of known condition names (exact match in text)
    # Look for condition-like names that appear in tables or bold text
    # This is heuristic — we look for unknown names that look like conditions
    known_lower = {name.lower() for name in registry.condition_names}
    # Common generic terms that should NOT be flagged as fabricated conditions
    _GENERIC_TERMS = {
        "method", "metric", "condition", "---", "",
        "model", "approach", "variant", "architecture",
        "ours", "average", "mean", "std", "total",
        "baseline", "proposed", "ablation", "default",
        "results", "table", "figure", "section",
    }
    def _is_candidate(name: str) -> bool:
        """Check if a cleaned name looks like a real condition name."""
        return bool(
            name
            and name.lower() not in known_lower
            and name.lower() not in _GENERIC_TERMS
            and not name.startswith("\\")
            and len(name) > 1
            and not name.isdigit()
            # BUG-DA8-15: Reject numeric-looking strings (e.g. "91.5" from \textbf{91.5})
            and not re.match(r'^[\d.eE+\-]+$', name)
        )
    def _clean_latex(s: str) -> str:
        # Strip \textbf{...}/\textit{...} wrappers and unescape underscores.
        s = re.sub(r"\\textbf\{([^}]*)\}", r"\1", s)
        s = re.sub(r"\\textit\{([^}]*)\}", r"\1", s)
        return s.replace("\\_", "_").strip()
    # Tracks lower-cased names already reported (dedup across both passes).
    _seen_names: set[str] = set()
    # 1. Extract potential condition names from TABLE ROWS
    for i, line in enumerate(lines):
        if "&" in line and "\\\\" in line:
            cells = line.split("&")
            if cells:
                # First cell of a table row is the most likely condition label.
                cand_clean = _clean_latex(cells[0].strip().rstrip("\\").strip())
                if _is_candidate(cand_clean) and cand_clean.lower() not in _seen_names:
                    _seen_names.add(cand_clean.lower())
                    fabricated.append(
                        FabricatedCondition(
                            name=cand_clean,
                            line_number=i + 1,
                            context=line.strip()[:120],
                        )
                    )
    # 2. BUG-23 P2: Also check PROSE — bold/italic condition mentions in
    # Results/Experiments sections that don't match known conditions.
    _strict_sections_lower = {
        "results", "experiments", "experimental results",
        "evaluation", "ablation", "comparison",
    }
    sections = _parse_sections(tex_text)
    for i, line in enumerate(lines):
        section = _section_at_line(sections, i)
        # NOTE: exact (full-title) match here, unlike the substring match
        # used by _is_strict_section.
        if not section or section.lower() not in _strict_sections_lower:
            continue
        # Find \textbf{CondName} or \textit{CondName} in prose
        for m in re.finditer(r"\\text(?:bf|it)\{([^}]+)\}", line):
            cand_clean = _clean_latex(m.group(1)).strip()
            # Only flag multi-word or snake_case names that look like conditions
            # (cand_clean[0] is safe: _is_candidate requires len > 1).
            if (
                _is_candidate(cand_clean)
                and ("_" in cand_clean or cand_clean[0].isupper())
                and cand_clean.lower() not in _seen_names
            ):
                _seen_names.add(cand_clean.lower())
                fabricated.append(
                    FabricatedCondition(
                        name=cand_clean,
                        line_number=i + 1,
                        context=line.strip()[:120],
                    )
                )
    return fabricated
def _check_training_config(
    tex_text: str,
    registry: VerifiedRegistry,
) -> list[str]:
    """BUG-23 P2: Check if paper claims about training config match reality.

    Extracts epoch counts and condition/method counts from the paper text
    and compares them against ``registry.training_config`` /
    ``registry.condition_names``.  Returns a list of human-readable
    warning strings (empty when nothing is suspicious).
    """
    warnings: list[str] = []
    # Extract "trained for N epochs" or "N epochs" claims
    epoch_claims = re.findall(
        r"(?:trained?\s+(?:for\s+)?|over\s+|(?:for|with)\s+)(\d+)\s+epoch",
        tex_text,
        re.IGNORECASE,
    )
    if epoch_claims and registry.training_config:
        # NOTE(review): assumes these config values are numeric when present
        # — a string value would raise in the arithmetic below; confirm
        # against how VerifiedRegistry populates training_config.
        actual_steps = registry.training_config.get("TRAINING_STEPS")
        actual_epochs = registry.training_config.get("epochs")
        if actual_epochs is not None:
            for claim in epoch_claims:
                claimed = int(claim)
                # Allow 30% relative slack, but at least ±5 epochs.
                if abs(claimed - actual_epochs) > max(5, actual_epochs * 0.3):
                    warnings.append(
                        f"Paper claims {claimed} epochs but experiment ran {int(actual_epochs)} epochs"
                    )
        elif actual_steps is not None:
            # Can't compare epochs to steps directly, but flag very large claims
            for claim in epoch_claims:
                claimed = int(claim)
                if claimed > 500:
                    warnings.append(
                        f"Paper claims {claimed} epochs — verify against actual training steps ({int(actual_steps)})"
                    )
    # Check condition count claims ("N conditions" / "N methods" / "N baselines")
    count_claims = re.findall(
        r"(\d+)\s+(?:condition|method|baseline|approach|variant)s?\b",
        tex_text,
        re.IGNORECASE,
    )
    if count_claims and registry.condition_names:
        actual_count = len(registry.condition_names)
        for claim in count_claims:
            claimed = int(claim)
            # +1 slack: papers often count "ours" in addition to baselines.
            if claimed > actual_count + 1:
                warnings.append(
                    f"Paper claims {claimed} conditions/methods but only {actual_count} ran"
                )
    if warnings:
        logger.warning("Training config validation: %s", warnings)
    return warnings
def _build_summary(result: VerificationResult) -> str:
"""Build human-readable summary."""
parts = [f"severity={result.severity}"]
parts.append(
f"checked={result.total_numbers_checked}, "
f"verified={result.total_numbers_verified}, "
f"unverified={len(result.unverified_numbers)}"
)
if result.strict_violations:
parts.append(f"strict_violations={result.strict_violations}")
if result.fabricated_conditions:
names = [fc.name for fc in result.fabricated_conditions[:3]]
parts.append(f"fabricated_conditions={names}")
if result.config_warnings:
parts.append(f"config_warnings={len(result.config_warnings)}")
return "; ".join(parts)
================================================
FILE: researchclaw/pipeline/runner.py
================================================
from __future__ import annotations
import json
import importlib
import logging
import os
import shutil
import tempfile
import time as _time
from pathlib import Path
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.evolution import EvolutionStore, extract_lessons
from researchclaw.knowledge.base import write_stage_to_kb
from researchclaw.pipeline.executor import StageResult, execute_stage
from researchclaw.pipeline.stages import (
DECISION_ROLLBACK,
MAX_DECISION_PIVOTS,
NONCRITICAL_STAGES,
STAGE_SEQUENCE,
Stage,
StageStatus,
)
def _utcnow_iso() -> str:
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat(timespec="seconds")
def _should_start(stage: Stage, from_stage: Stage, started: bool) -> bool:
if started:
return True
return stage == from_stage
def _build_pipeline_summary(
    *,
    run_id: str,
    results: list[StageResult],
    from_stage: Stage,
    run_dir: Path | None = None,
) -> dict[str, object]:
    """Assemble the ``pipeline_summary.json`` payload from stage results."""
    done_count = sum(1 for r in results if r.status == StageStatus.DONE)
    blocked_count = sum(
        1 for r in results if r.status == StageStatus.BLOCKED_APPROVAL
    )
    failed_count = sum(1 for r in results if r.status == StageStatus.FAILED)
    last = results[-1] if results else None
    return {
        "run_id": run_id,
        "stages_executed": len(results),
        "stages_done": done_count,
        "stages_blocked": blocked_count,
        "stages_failed": failed_count,
        "degraded": any(r.decision == "degraded" for r in results),
        "from_stage": int(from_stage),
        "final_stage": int(last.stage) if last is not None else int(from_stage),
        "final_status": last.status.value if last is not None else "no_stages",
        "generated": _utcnow_iso(),
        "content_metrics": _collect_content_metrics(run_dir),
    }
def _write_pipeline_summary(run_dir: Path, summary: dict[str, object]) -> None:
(run_dir / "pipeline_summary.json").write_text(
json.dumps(summary, indent=2),
encoding="utf-8",
)
def _write_checkpoint(run_dir: Path, stage: Stage, run_id: str) -> None:
    """Write checkpoint atomically (temp file + rename) to prevent corruption."""
    payload = json.dumps(
        {
            "last_completed_stage": int(stage),
            "last_completed_name": stage.name,
            "run_id": run_id,
            "timestamp": _utcnow_iso(),
        },
        indent=2,
    )
    target = run_dir / "checkpoint.json"
    # Write to a temp file in the same directory so the final rename is
    # atomic on the same filesystem.
    fd, tmp_name = tempfile.mkstemp(dir=run_dir, suffix=".tmp", prefix="checkpoint_")
    os.close(fd)
    tmp = Path(tmp_name)
    try:
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write(payload)
        tmp.replace(target)
    except BaseException:
        # Never leave a stray temp file behind, even on KeyboardInterrupt.
        tmp.unlink(missing_ok=True)
        raise
def _write_heartbeat(run_dir: Path, stage: Stage, run_id: str) -> None:
    """Write ``heartbeat.json`` for sentinel watchdog monitoring.

    Records the worker PID, the most recent stage (number and name), the
    run id, and a UTC timestamp so an external watchdog can detect a
    stalled or dead run.
    """
    # ``os`` is already imported at module level — the previous
    # function-local ``import os`` was a redundant shadow and is removed.
    heartbeat = {
        "pid": os.getpid(),
        "last_stage": int(stage),
        "last_stage_name": stage.name,
        "run_id": run_id,
        "timestamp": _utcnow_iso(),
    }
    (run_dir / "heartbeat.json").write_text(
        json.dumps(heartbeat, indent=2), encoding="utf-8"
    )
def read_checkpoint(run_dir: Path) -> Stage | None:
    """Read checkpoint and return the NEXT stage to execute.

    Returns ``None`` when there is no checkpoint, the checkpoint is
    malformed or unreadable, the recorded stage number is unknown, or the
    last completed stage was already the final stage.  Never raises.
    """
    cp_path = run_dir / "checkpoint.json"
    if not cp_path.exists():
        return None
    try:
        data = json.loads(cp_path.read_text(encoding="utf-8"))
        last_num = data.get("last_completed_stage")
        if last_num is None:
            return None
        for i, stage in enumerate(STAGE_SEQUENCE):
            if int(stage) == last_num:
                # Advance to the successor unless this was the last stage.
                if i + 1 < len(STAGE_SEQUENCE):
                    return STAGE_SEQUENCE[i + 1]
                return None
        return None
    except (json.JSONDecodeError, OSError, AttributeError, TypeError, ValueError):
        # OSError: the file can vanish or become unreadable between the
        # exists() check and read_text() (crash-recovery race).
        # AttributeError: checkpoint JSON that is not an object (e.g. a
        # list) has no .get().
        return None
def resume_from_checkpoint(
    run_dir: Path, default_stage: Stage = Stage.TOPIC_INIT
) -> Stage:
    """Resolve the stage to resume from using checkpoint metadata."""
    checkpoint_stage = read_checkpoint(run_dir)
    if checkpoint_stage is None:
        return default_stage
    return checkpoint_stage
def _collect_content_metrics(run_dir: Path | None) -> dict[str, object]:
    """Collect content authenticity metrics from stage outputs.

    Best-effort, never raises.  Reads:

    - ``stage-17/paper_draft.md`` → ``template_ratio`` via
      ``researchclaw.quality.compute_template_ratio``;
    - ``stage-23/verification_report.json`` → citation counts and
      ``citation_verify_score``.

    Metric values stay ``None`` when their inputs are missing or
    unparsable.
    """
    metrics: dict[str, object] = {
        "template_ratio": None,
        "citation_verify_score": None,
        "total_citations": None,
        "verified_citations": None,
        "degraded_sources": [],
    }
    if run_dir is None:
        return metrics
    draft_path = run_dir / "stage-17" / "paper_draft.md"
    if draft_path.exists():
        try:
            # Imported lazily so a missing quality module only skips this
            # metric instead of breaking the summary.
            quality_module = importlib.import_module("researchclaw.quality")
            compute_template_ratio = quality_module.compute_template_ratio
            text = draft_path.read_text(encoding="utf-8")
            metrics["template_ratio"] = round(compute_template_ratio(text), 4)
        except (
            AttributeError,
            ModuleNotFoundError,
            UnicodeDecodeError,
            OSError,
            ValueError,
            TypeError,
        ):
            pass
    verify_path = run_dir / "stage-23" / "verification_report.json"
    if verify_path.exists():
        try:
            vdata = json.loads(verify_path.read_text(encoding="utf-8"))
            if isinstance(vdata, dict):
                # The report may nest counts under "summary" or be flat.
                summary = vdata.get("summary", vdata)
                total = summary.get("total", 0) if isinstance(summary, dict) else None
                verified = summary.get("verified", 0) if isinstance(summary, dict) else None
                if isinstance(total, int | float) and isinstance(verified, int | float):
                    total_num = int(total)
                    verified_num = int(verified)
                    metrics["total_citations"] = total_num
                    metrics["verified_citations"] = verified_num
                    if total_num > 0:
                        metrics["citation_verify_score"] = round(
                            verified_num / total_num, 4
                        )
        except (json.JSONDecodeError, OSError, TypeError, ValueError):
            pass
    return metrics
logger = logging.getLogger(__name__)
def _run_experiment_diagnosis(run_dir: Path, config: RCConfig, run_id: str) -> None:
    """Run experiment diagnosis after Stage 14 and save reports.

    Produces:
    - ``run_dir/experiment_diagnosis.json`` — structured diagnosis + quality assessment
    - ``run_dir/repair_prompt.txt`` — repair instructions (if quality is insufficient)

    Best-effort: the whole routine is wrapped in a broad try/except so a
    diagnosis failure never aborts the pipeline.
    """
    try:
        from researchclaw.pipeline.experiment_diagnosis import (
            diagnose_experiment,
            assess_experiment_quality,
        )
        # Find the most recent stage-14 experiment_summary.json
        # (sorted() + overwrite keeps the lexicographically last candidate).
        summary_path = None
        for candidate in sorted(run_dir.glob("stage-14*/experiment_summary.json")):
            summary_path = candidate
        if not summary_path or not summary_path.exists():
            return
        summary = json.loads(summary_path.read_text(encoding="utf-8"))
        # Collect stdout/stderr from experiment runs (first 5 run files only)
        stdout, stderr = "", ""
        runs_dir = summary_path.parent / "runs"
        if runs_dir.is_dir():
            for run_file in sorted(runs_dir.glob("*.json"))[:5]:
                try:
                    run_data = json.loads(run_file.read_text(encoding="utf-8"))
                    if isinstance(run_data, dict):
                        stdout += run_data.get("stdout", "") + "\n"
                        stderr += run_data.get("stderr", "") + "\n"
                except (json.JSONDecodeError, OSError):
                    continue
        # Load experiment plan from stage-09 (YAML preferred; JSON fallback).
        # Each loop keeps the last successfully-parsed candidate.
        plan = None
        for candidate in sorted(run_dir.glob("stage-09*/exp_plan.yaml")):
            try:
                import yaml as _yaml_diag
                plan = _yaml_diag.safe_load(candidate.read_text(encoding="utf-8"))
            except Exception:
                pass
        if plan is None:
            for candidate in sorted(run_dir.glob("stage-09*/experiment_design.json")):
                try:
                    plan = json.loads(candidate.read_text(encoding="utf-8"))
                except (json.JSONDecodeError, OSError):
                    pass
        # Load refinement log if available
        ref_log = None
        for candidate in sorted(run_dir.glob("stage-13*/refinement_log.json")):
            try:
                ref_log = json.loads(candidate.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError):
                pass
        # Run diagnosis
        diag = diagnose_experiment(
            experiment_summary=summary,
            experiment_plan=plan,
            refinement_log=ref_log,
            stdout=stdout.strip(),
            stderr=stderr.strip(),
        )
        # Run quality assessment
        qa = assess_experiment_quality(summary, ref_log)
        # Save diagnosis report
        diag_report = {
            "diagnosis": diag.to_dict(),
            "quality_assessment": {
                "mode": qa.mode.value,
                "sufficient": qa.sufficient,
                "repair_possible": qa.repair_possible,
                "deficiency_types": [d.type.value for d in qa.deficiencies],
            },
            "repair_needed": not qa.sufficient,
            "generated": _utcnow_iso(),
        }
        (run_dir / "experiment_diagnosis.json").write_text(
            json.dumps(diag_report, indent=2), encoding="utf-8"
        )
        if not qa.sufficient:
            # Generate repair prompt for the REFINE loop
            from researchclaw.pipeline.experiment_repair import build_repair_prompt
            code: dict[str, str] = {}
            # Try refined code first, then stage-10 experiment dir, then raw
            # stage-10; stop at the first glob pattern that yields any file.
            for _glob_pat in (
                "stage-13*/experiment_final/*.py",
                "stage-10*/experiment/*.py",
                "stage-10*/*.py",
            ):
                for candidate in sorted(run_dir.glob(_glob_pat)):
                    try:
                        code[candidate.name] = candidate.read_text(encoding="utf-8")
                    except (OSError, UnicodeDecodeError):
                        pass
                if code:
                    break
            repair_prompt = build_repair_prompt(
                diag, code, time_budget_sec=config.experiment.time_budget_sec
            )
            (run_dir / "repair_prompt.txt").write_text(
                repair_prompt, encoding="utf-8"
            )
            logger.info(
                "[%s] Experiment diagnosis: mode=%s, deficiencies=%d — repair prompt saved",
                run_id, qa.mode.value, len(diag.deficiencies),
            )
            print(
                f"[{run_id}] Experiment diagnosis: {qa.mode.value} "
                f"({len(diag.deficiencies)} issues found, repair needed)"
            )
        else:
            logger.info(
                "[%s] Experiment diagnosis: mode=%s, sufficient=True — quality OK",
                run_id, qa.mode.value,
            )
            print(f"[{run_id}] Experiment diagnosis: {qa.mode.value} — quality OK")
    except Exception as exc:
        logger.warning("Experiment diagnosis failed: %s", exc)
def _run_experiment_repair(run_dir: Path, config: RCConfig, run_id: str) -> None:
    """Execute the experiment repair loop when diagnosis finds quality issues.

    Calls the repair loop from ``experiment_repair.py`` which:
    1. Loads experiment code and diagnosis
    2. Gets fixes from LLM or OpenCode
    3. Re-runs experiment in sandbox
    4. Re-assesses quality
    5. Repeats up to max_cycles

    Best-effort: any failure is logged and printed, never raised.
    """
    try:
        from researchclaw.pipeline.experiment_repair import run_repair_loop
        repair_result = run_repair_loop(
            run_dir=run_dir,
            config=config,
            run_id=run_id,
        )
        # Save repair result
        (run_dir / "experiment_repair_result.json").write_text(
            json.dumps(repair_result.to_dict(), indent=2), encoding="utf-8"
        )
        # BUG-186: Promote best experiment summary to stage-14/ so
        # downstream stages (sanitizer, paper_verifier) see it.
        # BUG-198: Only promote if the repair summary is RICHER than
        # the existing stage-14 summary. The repair loop can produce
        # empty summaries (metrics: {}, 0 conditions) which would
        # overwrite enriched data from the analysis stage.
        if repair_result.best_experiment_summary:
            from researchclaw.pipeline.experiment_repair import (
                _summary_quality_score,
            )
            best_path = run_dir / "stage-14" / "experiment_summary.json"
            existing_score = 0.0
            if best_path.exists():
                try:
                    existing = json.loads(
                        best_path.read_text(encoding="utf-8")
                    )
                    existing_score = _summary_quality_score(existing)
                except (json.JSONDecodeError, OSError):
                    # Unreadable existing summary scores 0 → repair wins.
                    pass
            repair_score = _summary_quality_score(
                repair_result.best_experiment_summary
            )
            if repair_score > existing_score:
                best_path.write_text(
                    json.dumps(
                        repair_result.best_experiment_summary, indent=2
                    ),
                    encoding="utf-8",
                )
                logger.info(
                    "[%s] Promoted repair results to stage-14 "
                    "(score %.1f > %.1f, success=%s)",
                    run_id, repair_score, existing_score,
                    repair_result.success,
                )
            else:
                logger.info(
                    "[%s] Kept existing stage-14 summary (score %.1f >= "
                    "repair score %.1f)",
                    run_id, existing_score, repair_score,
                )
        if repair_result.success:
            # Re-run diagnosis with updated results
            _run_experiment_diagnosis(run_dir, config, run_id)
        else:
            logger.info(
                "[%s] Repair loop completed without reaching full_paper quality "
                "(best mode: %s, %d cycles)",
                run_id, repair_result.final_mode.value, repair_result.total_cycles,
            )
    except Exception as exc:
        logger.warning("[%s] Experiment repair failed: %s", run_id, exc)
        print(f"[{run_id}] Experiment repair failed: {exc}")
def execute_pipeline(
    *,
    run_dir: Path,
    run_id: str,
    config: RCConfig,
    adapters: AdapterBundle,
    from_stage: Stage = Stage.TOPIC_INIT,
    auto_approve_gates: bool = False,
    stop_on_gate: bool = False,
    skip_noncritical: bool = False,
    kb_root: Path | None = None,
) -> list[StageResult]:
    """Execute pipeline stages sequentially from `from_stage` and write summary.

    For every stage in ``STAGE_SEQUENCE`` (starting at ``from_stage``) this:
      1. runs the stage via :func:`execute_stage` and prints progress,
      2. optionally mirrors DONE-stage artifacts into the knowledge base,
      3. writes a checkpoint and heartbeat,
      4. after RESULT_ANALYSIS, runs experiment diagnosis and (if the
         diagnosis flags ``repair_needed``) the repair loop,
      5. on a PIVOT/REFINE decision at RESEARCH_DECISION, versions the
         affected stage dirs and recurses from the rollback target (bounded
         by ``MAX_DECISION_PIVOTS``; empty-metric cycles force PROCEED),
      6. stops on FAILED stages (unless noncritical + ``skip_noncritical``)
         or on BLOCKED_APPROVAL when ``stop_on_gate`` is set.

    Afterwards it writes the pipeline summary, extracts evolution lessons,
    runs the MetaClaw post-pipeline hook, and packages deliverables — all
    three best-effort/non-blocking.

    Returns the list of StageResult objects in execution order (including
    results from any recursive rollback passes).
    """
    results: list[StageResult] = []
    started = False
    total_stages = len(STAGE_SEQUENCE)
    for stage in STAGE_SEQUENCE:
        # _should_start flips `started` once we reach `from_stage`.
        started = _should_start(stage, from_stage, started)
        if not started:
            continue
        stage_num = int(stage)
        prefix = f"[{run_id}] Stage {stage_num:02d}/{total_stages}"
        print(f"{prefix} {stage.name} — running...")
        # BUG-218: Ensure the best stage-14 experiment data is promoted
        # BEFORE paper writing begins. Without this, the recursive REFINE
        # path writes the paper using the latest (potentially worse)
        # iteration's data, because the post-recursion promotion at line
        # ~547 runs only after the recursive call—i.e. after the paper
        # has already been written.
        if stage == Stage.PAPER_OUTLINE:
            _promote_best_stage14(run_dir, config)
        t0 = _time.monotonic()
        result = execute_stage(
            stage,
            run_dir=run_dir,
            run_id=run_id,
            config=config,
            adapters=adapters,
            auto_approve_gates=auto_approve_gates,
        )
        elapsed = _time.monotonic() - t0
        # --- Console progress reporting per status ---
        if result.status == StageStatus.DONE:
            arts = ", ".join(result.artifacts) if result.artifacts else "none"
            if result.decision == "degraded":
                print(
                    f"{prefix} {stage.name} — DEGRADED ({elapsed:.1f}s) "
                    f"— continuing with sanitization → {arts}"
                )
            else:
                print(f"{prefix} {stage.name} — done ({elapsed:.1f}s) → {arts}")
        elif result.status == StageStatus.FAILED:
            err = result.error or "unknown error"
            print(f"{prefix} {stage.name} — FAILED ({elapsed:.1f}s) — {err}")
        elif result.status == StageStatus.BLOCKED_APPROVAL:
            print(f"{prefix} {stage.name} — blocked (awaiting approval)")
        results.append(result)
        # Mirror successful stage output into the knowledge base (best-effort).
        if kb_root is not None and result.status == StageStatus.DONE:
            try:
                stage_dir = run_dir / f"stage-{int(stage):02d}"
                write_stage_to_kb(
                    kb_root,
                    stage_id=int(stage),
                    stage_name=stage.name.lower(),
                    run_id=run_id,
                    artifacts=list(result.artifacts),
                    stage_dir=stage_dir,
                    backend=config.knowledge_base.backend,
                    topic=config.research.topic,
                )
            except Exception:  # noqa: BLE001
                # KB mirroring is strictly non-blocking.
                pass
        if result.status == StageStatus.DONE:
            _write_checkpoint(run_dir, stage, run_id)
        # --- Experiment diagnosis + repair after Stage 14 (result_analysis) ---
        if (
            stage == Stage.RESULT_ANALYSIS
            and result.status == StageStatus.DONE
            and config.experiment.repair.enabled
        ):
            _run_experiment_diagnosis(run_dir, config, run_id)
            # Check if repair loop should run
            _diag_path = run_dir / "experiment_diagnosis.json"
            if _diag_path.exists():
                try:
                    _diag_data = json.loads(_diag_path.read_text(encoding="utf-8"))
                    if _diag_data.get("repair_needed"):
                        _run_experiment_repair(run_dir, config, run_id)
                except (json.JSONDecodeError, OSError):
                    # Unreadable diagnosis → skip repair silently.
                    pass
        # --- Heartbeat for sentinel watchdog ---
        _write_heartbeat(run_dir, stage, run_id)
        # --- PIVOT/REFINE decision handling ---
        if (
            stage == Stage.RESEARCH_DECISION
            and result.status == StageStatus.DONE
            and result.decision in DECISION_ROLLBACK
        ):
            pivot_count = _read_pivot_count(run_dir)
            # R6-4: Skip REFINE if experiment metrics are empty for consecutive cycles
            if pivot_count > 0 and _consecutive_empty_metrics(run_dir, pivot_count):
                logger.warning(
                    "Consecutive REFINE cycles produced empty metrics — forcing PROCEED"
                )
                print(
                    f"[{run_id}] Consecutive empty metrics across REFINE cycles — forcing PROCEED"
                )
                # BUG-211: Promote best stage-14 before proceeding with
                # empty data — an earlier iteration may have real metrics.
                _promote_best_stage14(run_dir, config)
            elif pivot_count < MAX_DECISION_PIVOTS:
                rollback_target = DECISION_ROLLBACK[result.decision]
                _record_decision_history(
                    run_dir, result.decision, rollback_target, pivot_count + 1
                )
                logger.info(
                    "Decision %s: rolling back to %s (attempt %d/%d)",
                    result.decision.upper(),
                    rollback_target.name,
                    pivot_count + 1,
                    MAX_DECISION_PIVOTS,
                )
                print(
                    f"[{run_id}] Decision: {result.decision.upper()} → "
                    f"rollback to {rollback_target.name} "
                    f"(attempt {pivot_count + 1}/{MAX_DECISION_PIVOTS})"
                )
                # Version existing stage directories before overwriting
                _version_rollback_stages(
                    run_dir, rollback_target, pivot_count + 1
                )
                # Recurse from rollback target
                pivot_results = execute_pipeline(
                    run_dir=run_dir,
                    run_id=run_id,
                    config=config,
                    adapters=adapters,
                    from_stage=rollback_target,
                    auto_approve_gates=auto_approve_gates,
                    stop_on_gate=stop_on_gate,
                    skip_noncritical=skip_noncritical,
                    kb_root=kb_root,
                )
                results.extend(pivot_results)
                # BUG-211: Promote best stage-14 after REFINE completes so
                # downstream stages use the best data, not just the latest.
                _promote_best_stage14(run_dir, config)
                break  # Exit current loop; recursive call handles the rest
            else:
                # Quality gate: check if experiment results are actually usable
                _quality_ok, _quality_msg = _check_experiment_quality(
                    run_dir, pivot_count
                )
                if not _quality_ok:
                    logger.warning(
                        "Max pivot attempts (%d) reached — forcing PROCEED "
                        "with quality warning: %s",
                        MAX_DECISION_PIVOTS,
                        _quality_msg,
                    )
                    print(
                        f"[{run_id}] QUALITY WARNING: {_quality_msg}"
                    )
                    # Write quality warning to run directory
                    _qw_path = run_dir / "quality_warning.txt"
                    _qw_path.write_text(
                        f"Max pivots ({MAX_DECISION_PIVOTS}) reached.\n"
                        f"Quality gate failed: {_quality_msg}\n"
                        f"Paper will be written but may have significant issues.\n",
                        encoding="utf-8",
                    )
                else:
                    logger.warning(
                        "Max pivot attempts (%d) reached — forcing PROCEED",
                        MAX_DECISION_PIVOTS,
                    )
                    print(
                        f"[{run_id}] Max pivot attempts reached — forcing PROCEED"
                    )
                # BUG-205: After forced PROCEED, promote the BEST stage-14
                # experiment summary across all REFINE iterations.
                _promote_best_stage14(run_dir, config)
        if result.status == StageStatus.FAILED:
            if skip_noncritical and stage in NONCRITICAL_STAGES:
                logger.warning("Noncritical stage %s failed - skipping", stage.name)
            else:
                break
        if result.status == StageStatus.BLOCKED_APPROVAL and stop_on_gate:
            break
    summary = _build_pipeline_summary(
        run_id=run_id,
        results=results,
        from_stage=from_stage,
        run_dir=run_dir,
    )
    _write_pipeline_summary(run_dir, summary)
    # --- Evolution: extract and store lessons ---
    lessons: list[object] = []
    try:
        lessons = extract_lessons(results, run_id=run_id, run_dir=run_dir)
        if lessons:
            store = EvolutionStore(run_dir / "evolution")
            store.append_many(lessons)
            logger.info("Extracted %d lessons from pipeline run", len(lessons))
    except Exception:  # noqa: BLE001
        logger.warning("Evolution lesson extraction failed (non-blocking)")
    # --- MetaClaw bridge: convert high-severity lessons to skills ---
    try:
        _metaclaw_post_pipeline(config, results, lessons, run_id, run_dir)
    except Exception:  # noqa: BLE001
        logger.warning("MetaClaw post-pipeline hook failed (non-blocking)")
    # --- Package deliverables into a single folder ---
    try:
        deliverables_dir = _package_deliverables(run_dir, run_id, config)
        if deliverables_dir is not None:
            print(f"[{run_id}] Deliverables packaged → {deliverables_dir}")
    except Exception:  # noqa: BLE001
        logger.warning("Deliverables packaging failed (non-blocking)")
    return results
def _package_deliverables(
    run_dir: Path,
    run_id: str,
    config: RCConfig,
) -> Path | None:
    """Collect all final user-facing deliverables into a single ``deliverables/`` folder.

    Returns the deliverables directory path, or None if nothing was packaged
    (in which case the empty ``deliverables/`` directory is removed again).

    Packaged artifacts (best-available version selected automatically):
    - paper_final.md — Final paper (Markdown)
    - paper.tex — Conference-ready LaTeX
    - references.bib — BibTeX bibliography
    - code/ — Experiment code package
    - verification_report.json — Citation verification report (if available)

    A ``manifest.json`` listing the packaged files is written last.
    """
    dest = run_dir / "deliverables"
    dest.mkdir(parents=True, exist_ok=True)
    packaged: list[str] = []
    # --- 1. Final paper (Markdown) ---
    # Prefer verified version (stage 23) over base version (stage 22);
    # zero-byte candidates are ignored.
    paper_md = None
    for candidate in [
        run_dir / "stage-23" / "paper_final_verified.md",
        run_dir / "stage-22" / "paper_final.md",
    ]:
        if candidate.exists() and candidate.stat().st_size > 0:
            paper_md = candidate
            break
    if paper_md is not None:
        shutil.copy2(paper_md, dest / "paper_final.md")
        packaged.append("paper_final.md")
    # --- 2. LaTeX paper ---
    # BUG-183: Stage 22's paper.tex has been sanitized (fabricated numbers
    # replaced with ---). Regenerating from Markdown would undo this because
    # the Markdown was never sanitized. Prefer Stage-22 paper.tex when a
    # sanitization report exists. Only regenerate from verified Markdown if
    # no sanitization was performed (i.e., the run was clean).
    tex_regenerated = False
    _sanitization_report = run_dir / "stage-22" / "sanitization_report.json"
    _was_sanitized = _sanitization_report.exists()
    verified_md = run_dir / "stage-23" / "paper_final_verified.md"
    if (
        not _was_sanitized
        and paper_md is not None
        and paper_md == verified_md
        and verified_md.exists()
        and verified_md.stat().st_size > 0
    ):
        try:
            from researchclaw.templates import get_template, markdown_to_latex
            from researchclaw.pipeline.executor import _extract_paper_title
            tpl = get_template(config.export.target_conference)
            v_text = verified_md.read_text(encoding="utf-8")
            tex_content = markdown_to_latex(
                v_text,
                tpl,
                title=_extract_paper_title(v_text),
                authors=config.export.authors,
                bib_file=config.export.bib_file,
            )
            # IMP-17: Quality check — ensure regenerated LaTeX has
            # proper structure (non-empty abstract, multiple sections)
            _has_abstract = (
                "\\begin{abstract}" in tex_content
                and tex_content.split("\\begin{abstract}")[1]
                .split("\\end{abstract}")[0]
                .strip()
            )
            _section_count = tex_content.count("\\section{")
            if _has_abstract and _section_count >= 3:
                (dest / "paper.tex").write_text(tex_content, encoding="utf-8")
                packaged.append("paper.tex")
                tex_regenerated = True
                logger.info(
                    "Deliverables: regenerated paper.tex from verified markdown"
                )
            else:
                logger.warning(
                    "Regenerated paper.tex has poor structure "
                    "(abstract=%s, sections=%d) — using Stage 22 version",
                    bool(_has_abstract),
                    _section_count,
                )
        except Exception:  # noqa: BLE001
            logger.debug("paper.tex regeneration from verified md failed")
    elif _was_sanitized:
        logger.info(
            "Deliverables: using Stage 22 paper.tex (sanitized) — "
            "skipping markdown regeneration to preserve sanitization"
        )
    # Fallback: copy the Stage-22 paper.tex when regeneration did not happen.
    if not tex_regenerated:
        tex_src = run_dir / "stage-22" / "paper.tex"
        if tex_src.exists() and tex_src.stat().st_size > 0:
            shutil.copy2(tex_src, dest / "paper.tex")
            packaged.append("paper.tex")
    # --- 3. References (BibTeX) ---
    # Prefer verified bib (stage 23) over base bib (stage 22)
    bib_src = None
    for candidate in [
        run_dir / "stage-23" / "references_verified.bib",
        run_dir / "stage-22" / "references.bib",
    ]:
        if candidate.exists() and candidate.stat().st_size > 0:
            bib_src = candidate
            break
    if bib_src is not None:
        shutil.copy2(bib_src, dest / "references.bib")
        packaged.append("references.bib")
    # --- 4. Experiment code package ---
    code_src = run_dir / "stage-22" / "code"
    if code_src.is_dir():
        code_dest = dest / "code"
        if code_dest.exists():
            # Replace any previous packaging run's copy wholesale.
            shutil.rmtree(code_dest)
        shutil.copytree(code_src, code_dest)
        packaged.append("code/")
    # --- 5. Verification report (optional) ---
    verify_src = run_dir / "stage-23" / "verification_report.json"
    if verify_src.exists() and verify_src.stat().st_size > 0:
        shutil.copy2(verify_src, dest / "verification_report.json")
        packaged.append("verification_report.json")
    # --- 5b. Sanitization report (degraded mode) ---
    san_src = run_dir / "stage-22" / "sanitization_report.json"
    if san_src.exists() and san_src.stat().st_size > 0:
        shutil.copy2(san_src, dest / "sanitization_report.json")
        packaged.append("sanitization_report.json")
    # --- 6. Charts (optional) ---
    charts_src = run_dir / "stage-22" / "charts"
    if charts_src.is_dir() and any(charts_src.iterdir()):
        charts_dest = dest / "charts"
        if charts_dest.exists():
            shutil.rmtree(charts_dest)
        shutil.copytree(charts_src, charts_dest)
        packaged.append("charts/")
    # --- 7. Conference style files (.sty, .bst) ---
    try:
        from researchclaw.templates import get_template
        tpl = get_template(config.export.target_conference)
        style_files = tpl.get_style_files()
        for sf in style_files:
            shutil.copy2(sf, dest / sf.name)
            packaged.append(sf.name)
        if style_files:
            logger.info(
                "Deliverables: bundled %d style files for %s",
                len(style_files),
                tpl.display_name,
            )
    except Exception:  # noqa: BLE001
        logger.debug("Style file bundling skipped (template lookup failed)")
    # --- 8. Verify & repair cite key coverage (IMP-12 + IMP-14) ---
    tex_path = dest / "paper.tex"
    bib_path = dest / "references.bib"
    if tex_path.exists() and bib_path.exists():
        try:
            tex_text = tex_path.read_text(encoding="utf-8")
            bib_text = bib_path.read_text(encoding="utf-8")
            import re as _re
            # IMP-15: Deduplicate .bib entries (first occurrence of a key wins)
            _seen_bib_keys: set[str] = set()
            _deduped_entries: list[str] = []
            for _bm in _re.finditer(
                r"(@\w+\{([^,]+),.*?\n\})", bib_text, _re.DOTALL
            ):
                _bkey = _bm.group(2).strip()
                if _bkey not in _seen_bib_keys:
                    _seen_bib_keys.add(_bkey)
                    _deduped_entries.append(_bm.group(1))
            # Rewrite the .bib only when duplicates were actually dropped.
            if len(_deduped_entries) < len(
                list(_re.finditer(r"@\w+\{", bib_text))
            ):
                bib_text = "\n\n".join(_deduped_entries) + "\n"
                bib_path.write_text(bib_text, encoding="utf-8")
                logger.info(
                    "Deliverables: deduplicated .bib → %d entries",
                    len(_deduped_entries),
                )
            # Collect all cite keys from \cite{key1, key2}
            all_cite_keys: set[str] = set()
            for cm in _re.finditer(r"\\cite\{([^}]+)\}", tex_text):
                all_cite_keys.update(k.strip() for k in cm.group(1).split(","))
            bib_keys = set(_re.findall(r"@\w+\{([^,]+),", bib_text))
            missing = all_cite_keys - bib_keys
            # IMP-14: Strip orphaned \cite{key} from paper.tex
            if missing:
                logger.warning(
                    "Deliverables: stripping %d orphaned cite keys from "
                    "paper.tex: %s",
                    len(missing),
                    sorted(missing)[:10],
                )
                def _filter_cite(m: _re.Match[str]) -> str:
                    # Keep only cite keys that exist in the .bib; drop the
                    # whole \cite{} when none survive.
                    keys = [k.strip() for k in m.group(1).split(",")]
                    kept = [k for k in keys if k not in missing]
                    if not kept:
                        return ""
                    return "\\cite{" + ", ".join(kept) + "}"
                tex_text = _re.sub(r"\\cite\{([^}]+)\}", _filter_cite, tex_text)
                # Clean up whitespace artifacts: double spaces, space before period
                tex_text = _re.sub(r" +", " ", tex_text)
                tex_text = _re.sub(r" ([.,;:)])", r"\1", tex_text)
                tex_path.write_text(tex_text, encoding="utf-8")
                logger.info(
                    "Deliverables: paper.tex repaired — all remaining cite "
                    "keys verified"
                )
            else:
                logger.info(
                    "Deliverables: all %d cite keys verified in references.bib",
                    len(all_cite_keys),
                )
        except Exception:  # noqa: BLE001
            logger.debug("Cite key verification/repair skipped")
    # --- 9. IMP-18: Compile LaTeX to verify paper.tex ---
    if tex_path.exists() and bib_path.exists():
        try:
            from researchclaw.templates.compiler import compile_latex
            compile_result = compile_latex(tex_path, max_attempts=3, timeout=120)
            if compile_result.success:
                logger.info("IMP-18: paper.tex compiles successfully")
                # Keep the generated PDF
                # NOTE(review): pdf_path appears unused — pdf_file below is
                # what is actually checked; likely leftover.
                pdf_path = dest / tex_path.stem
                pdf_file = dest / (tex_path.stem + ".pdf")
                if pdf_file.exists():
                    packaged.append(f"{tex_path.stem}.pdf")
            else:
                logger.warning(
                    "IMP-18: paper.tex compilation failed after %d attempts: %s",
                    compile_result.attempts,
                    compile_result.errors[:3],
                )
                if compile_result.fixes_applied:
                    logger.info(
                        "IMP-18: Applied %d auto-fixes: %s",
                        len(compile_result.fixes_applied),
                        compile_result.fixes_applied,
                    )
        except Exception:  # noqa: BLE001
            logger.debug("IMP-18: LaTeX compilation skipped (non-blocking)")
    if not packaged:
        # Nothing to package — remove empty dir
        dest.rmdir()
        return None
    # --- Write manifest ---
    manifest = {
        "run_id": run_id,
        "target_conference": config.export.target_conference,
        "files": packaged,
        "generated": _utcnow_iso(),
        "notes": {
            "paper_final.md": "Final paper in Markdown format",
            "paper.tex": f"Conference-ready LaTeX ({config.export.target_conference})",
            "references.bib": "BibTeX bibliography (verified citations only)",
            "code/": "Experiment source code with requirements.txt",
            "verification_report.json": "Citation integrity & relevance verification",
            "charts/": "Result visualizations",
        },
    }
    (dest / "manifest.json").write_text(
        json.dumps(manifest, indent=2), encoding="utf-8"
    )
    logger.info(
        "Deliverables packaged: %s (%d items)",
        dest,
        len(packaged),
    )
    return dest
def _version_rollback_stages(
    run_dir: Path, rollback_target: Stage, attempt: int
) -> None:
    """Rename stage directories that a PIVOT/REFINE pass is about to rerun.

    Every directory from the rollback target up to and including
    RESEARCH_DECISION is renamed from ``stage-NN/`` to ``stage-NN_v{attempt}/``
    so the new pass can write fresh output without destroying the old run.
    A pre-existing version directory for the same attempt is removed first.
    """
    import shutil
    first_stage = int(rollback_target)
    # Everything through RESEARCH_DECISION (stage 15) gets rerun.
    last_stage = int(Stage.RESEARCH_DECISION)
    for num in range(first_stage, last_stage + 1):
        live_dir = run_dir / f"stage-{num:02d}"
        if not live_dir.exists():
            continue
        archived = run_dir / f"stage-{num:02d}_v{attempt}"
        if archived.exists():
            shutil.rmtree(archived)
        live_dir.rename(archived)
        logger.debug(
            "Versioned %s → %s", live_dir.name, archived.name
        )
def _consecutive_empty_metrics(run_dir: Path, pivot_count: int) -> bool:
"""R6-4: Check if the current and previous REFINE cycles both produced empty metrics."""
# Check the most recent experiment_summary.json (stage-14) and its versioned predecessor.
# BUG-215: When stage-14/ doesn't exist (renamed to stage-14_v{N} without
# promotion), fall back to the latest versioned directory as "current".
current = run_dir / "stage-14" / "experiment_summary.json"
if not current.exists():
# Try the latest versioned directory
for _v in range(pivot_count + 1, 0, -1):
alt = run_dir / f"stage-14_v{_v}" / "experiment_summary.json"
if alt.exists():
current = alt
break
prev = run_dir / f"stage-14_v{pivot_count}" / "experiment_summary.json"
for path in (current, prev):
if not path.exists():
return False
try:
data = json.loads(path.read_text(encoding="utf-8"))
# Check all possible metric locations
has_metrics = False
ms = data.get("metrics_summary", {})
if isinstance(ms, dict) and ms:
has_metrics = True
br = data.get("best_run", {})
if isinstance(br, dict) and br.get("metrics"):
has_metrics = True
if has_metrics:
return False # At least one cycle had real metrics
except (json.JSONDecodeError, OSError, AttributeError):
return False
return True # Both cycles had empty metrics
def _promote_best_stage14(run_dir: Path, config: RCConfig) -> None:
    """BUG-205: After forced PROCEED, promote the best stage-14 experiment.

    Scans all ``stage-14*`` directories, scores them by primary metric,
    and copies the best experiment_summary.json into ``stage-14/`` if the
    current ``stage-14/`` is not already the best.

    Side effects:
      - Always writes ``experiment_summary_best.json`` (and, when present,
        ``analysis_best.md``) at the run root for the winning iteration
        (BUG-223 / BUG-225), even when no promotion into stage-14/ happens.
      - When promoting, also copies analysis/table/figure-plan files and the
        ``charts/`` directory from the winning iteration into ``stage-14/``.

    The metric key/direction come from ``config.experiment``; directories
    whose summary lacks a usable value for that metric are ignored.
    """
    import shutil
    metric_key = config.experiment.metric_key or "primary_metric"
    metric_dir = config.experiment.metric_direction or "maximize"
    # (metric value, stage directory) for every scorable iteration.
    candidates: list[tuple[float, Path]] = []
    for d in sorted(run_dir.glob("stage-14*")):
        summary_path = d / "experiment_summary.json"
        if not summary_path.exists():
            continue
        try:
            data = json.loads(summary_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            continue
        ms = data.get("metrics_summary", {})
        pm_val: float | None = None
        # BUG-DA8-03: Exact match first, then substring fallback
        # (avoids "accuracy" matching "balanced_accuracy")
        if metric_key in ms:
            _v = ms[metric_key]
            try:
                # Values may be aggregate dicts {min,max,mean,count} or scalars.
                pm_val = float(_v["mean"] if isinstance(_v, dict) else _v)
            except (TypeError, ValueError, KeyError):
                pass
        if pm_val is None:
            for k, v in ms.items():
                if metric_key in k:
                    try:
                        pm_val = float(v["mean"] if isinstance(v, dict) else v)
                    except (TypeError, ValueError, KeyError):
                        pass
                    break
        if pm_val is not None:
            candidates.append((pm_val, d))
    if not candidates:
        return  # nothing to promote
    current_dir = run_dir / "stage-14"
    # Sort: best first (descending when maximizing, ascending when minimizing).
    candidates.sort(key=lambda x: x[0], reverse=(metric_dir == "maximize"))
    # BUG-226: Detect degenerate near-zero metrics (broken normalization or
    # collapsed training). When minimising, a value >1000x smaller than the
    # second-best almost certainly comes from a degenerate iteration.
    if metric_dir == "minimize" and len(candidates) > 1:
        _bv, _bd = candidates[0]
        _sv = candidates[1][0]
        if 0 < _bv < _sv * 1e-3:
            logger.warning(
                "BUG-226: Degenerate best value %.6g is >1000× smaller than "
                "second-best %.6g — skipping degenerate iteration %s",
                _bv, _sv, _bd.name,
            )
            candidates.pop(0)
    best_val, best_dir = candidates[0]
    # BUG-223: Always write canonical best summary at run root BEFORE any
    # early return, so downstream consumers (Stage 17, Stage 20, Stage 22,
    # VerifiedRegistry) always find experiment_summary_best.json.
    _best_src = best_dir / "experiment_summary.json"
    if _best_src.exists():
        shutil.copy2(_best_src, run_dir / "experiment_summary_best.json")
        logger.info(
            "BUG-223: Wrote experiment_summary_best.json from %s (%.4f)",
            best_dir.name, best_val,
        )
    # BUG-225: Also copy analysis.md from the best iteration so Stage 17
    # doesn't read stale analysis from a degenerate non-versioned stage-14.
    _best_analysis = best_dir / "analysis.md"
    if _best_analysis.exists():
        shutil.copy2(_best_analysis, run_dir / "analysis_best.md")
    if best_dir == current_dir:
        logger.info("BUG-205: stage-14/ already has the best result (%.4f)", best_val)
        return
    # Promote: copy best summary into stage-14/
    current_summary = current_dir / "experiment_summary.json"
    best_summary = best_dir / "experiment_summary.json"
    # BUG-213: Also promote when stage-14/ is missing or empty
    if best_summary.exists():
        current_dir.mkdir(parents=True, exist_ok=True)
        logger.warning(
            "BUG-205: Promoting %s (%.4f) over stage-14/",
            best_dir.name, best_val,
        )
        shutil.copy2(best_summary, current_summary)
        # Also copy charts, analysis, and figure plans if they exist
        for fname in [
            "analysis.md",
            "results_table.tex",
            "figure_plan.json",  # BUG-213: must travel with metrics
            "figure_plan_final.json",  # BUG-213: ditto
        ]:
            src = best_dir / fname
            if src.exists():
                shutil.copy2(src, current_dir / fname)
        # Copy charts directory (replace any stale one wholesale)
        best_charts = best_dir / "charts"
        current_charts = current_dir / "charts"
        if best_charts.is_dir():
            if current_charts.is_dir():
                shutil.rmtree(current_charts)
            shutil.copytree(best_charts, current_charts)
def _check_experiment_quality(
run_dir: Path, pivot_count: int
) -> tuple[bool, str]:
"""Quality gate before forced PROCEED.
Returns (ok, message). ok=False means experiment results have critical
quality issues and the forced-PROCEED paper will likely be poor.
"""
# BUG-DA8-18: Check experiment_summary_best.json first (repair-promoted)
summary_path = run_dir / "experiment_summary_best.json"
if not summary_path.exists():
summary_path = run_dir / "stage-14" / "experiment_summary.json"
if not summary_path.exists():
for v in range(pivot_count, 0, -1):
alt = run_dir / f"stage-14_v{v}" / "experiment_summary.json"
if alt.exists():
summary_path = alt
break
if not summary_path.exists():
return False, "No experiment_summary.json found — no metrics produced"
try:
data = json.loads(summary_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
return False, "experiment_summary.json is malformed"
# Check 1: Are all metrics zero?
ms = data.get("metrics_summary", {})
if isinstance(ms, dict):
values: list[float] = []
for k, v in ms.items():
if isinstance(v, (int, float)):
values.append(float(v))
# BUG-212: metrics_summary values are often dicts {min,max,mean,count}
elif isinstance(v, dict) and "mean" in v:
_mv = v["mean"]
if isinstance(_mv, (int, float)):
values.append(float(_mv))
if values and all(v == 0.0 for v in values):
return False, "All experiment metrics are zero — experiments likely failed"
# Check 2: Zero variance across conditions (R13-1)
# Look for ablation_warnings or condition comparison data
ablation_warnings = data.get("ablation_warnings", [])
# BUG-212: Key is "condition_summaries", not "conditions"
conditions = data.get(
"condition_summaries", data.get("condition_metrics", {})
)
if isinstance(conditions, dict) and len(conditions) >= 2:
primary_values: list[float] = []
for cond_name, cond_data in conditions.items():
if isinstance(cond_data, dict):
# BUG-212: Primary metric lives inside cond_data["metrics"]
_metrics = cond_data.get("metrics", cond_data)
pm = _metrics.get(
"primary_metric",
_metrics.get("primary_metric_mean"),
)
if isinstance(pm, (int, float)):
primary_values.append(float(pm))
if len(primary_values) >= 2 and len(set(primary_values)) == 1:
return False, (
f"All {len(primary_values)} conditions have identical primary_metric "
f"({primary_values[0]}) — condition implementations are likely broken"
)
# Check 3: Too many ablation warnings
if isinstance(ablation_warnings, list) and len(ablation_warnings) >= 3:
return False, (
f"{len(ablation_warnings)} ablation warnings — most conditions "
f"produce identical results"
)
# Check 4: Analysis quality score (if available)
quality = data.get("analysis_quality", data.get("quality_score"))
if isinstance(quality, (int, float)) and quality < 3.0:
return False, f"Analysis quality score {quality}/10 — below minimum threshold"
return True, "Quality checks passed"
def _read_pivot_count(run_dir: Path) -> int:
"""Read how many PIVOT/REFINE decisions have been made so far."""
history_path = run_dir / "decision_history.json"
if not history_path.exists():
return 0
try:
data = json.loads(history_path.read_text(encoding="utf-8"))
if isinstance(data, list):
return len(data)
except (json.JSONDecodeError, OSError):
pass
return 0
def _record_decision_history(
    run_dir: Path, decision: str, rollback_target: Stage, attempt: int
) -> None:
    """Append one PIVOT/REFINE decision event to ``decision_history.json``.

    Existing history is preserved; an unreadable or non-list file is
    treated as empty rather than raising.
    """
    history_file = run_dir / "decision_history.json"
    events: list[dict[str, object]] = []
    if history_file.exists():
        try:
            loaded = json.loads(history_file.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            loaded = None
        if isinstance(loaded, list):
            events = loaded
    events.append({
        "decision": decision,
        "rollback_target": rollback_target.name,
        "rollback_stage_num": int(rollback_target),
        "attempt": attempt,
        "timestamp": _utcnow_iso(),
    })
    history_file.write_text(
        json.dumps(events, indent=2), encoding="utf-8"
    )
# Module-level logger shared by the pipeline helpers in this file. Defined
# mid-file, but earlier functions reference it only at call time, so the
# name resolves once the module has finished importing.
logger = logging.getLogger(__name__)
def _read_quality_score(run_dir: Path) -> float | None:
"""Extract quality score from the most recent quality_report.json."""
report_path = run_dir / "stage-20" / "quality_report.json"
if not report_path.exists():
return None
try:
data = json.loads(report_path.read_text(encoding="utf-8"))
if isinstance(data, dict):
# Try common keys: score_1_to_10, score, quality_score
for key in ("score_1_to_10", "score", "quality_score", "overall_score"):
if key in data:
return float(data[key])
except (json.JSONDecodeError, ValueError, TypeError):
pass
return None
def _write_iteration_context(
    run_dir: Path, iteration: int, reviews: str, quality_score: float | None
) -> None:
    """Persist iteration feedback (``iteration_context.json``) so the next
    improvement round can read the previous round's reviews and score.

    Review text is truncated to 3000 characters.
    """
    excerpt = reviews[:3000] if reviews else ""
    payload = {
        "iteration": iteration,
        "quality_score": quality_score,
        "reviews_excerpt": excerpt,
        "generated": _utcnow_iso(),
    }
    target = run_dir / "iteration_context.json"
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
def execute_iterative_pipeline(
    *,
    run_dir: Path,
    run_id: str,
    config: RCConfig,
    adapters: AdapterBundle,
    auto_approve_gates: bool = False,
    kb_root: Path | None = None,
    max_iterations: int = 3,
    quality_threshold: float = 7.0,
    convergence_rounds: int = 2,
) -> dict[str, object]:
    """Run the full pipeline with iterative quality improvement.

    After the first full pass (stages 1-22), if the quality gate score is below
    *quality_threshold*, re-run stages 16-22 (paper writing + finalization) with
    review feedback injected. Stop when:
    - Score >= quality_threshold, OR
    - Score hasn't improved for *convergence_rounds* consecutive iterations
      (max-min spread of the recent scores below 0.5), OR
    - *max_iterations* reached.

    Each improvement round writes ``iteration_context.json`` (reviews excerpt
    + last score) so the re-run stages can use the feedback. A final
    ``iteration_summary.json`` is written to *run_dir*, and deliverables are
    packaged (best-effort) at the end.

    Returns a summary dict with iteration history.
    """
    iteration_scores: list[float | None] = []
    all_results: list[list[StageResult]] = []
    # --- First full pass ---
    logger.info("Iteration 1/%d: running full pipeline (stages 1-22)", max_iterations)
    results = execute_pipeline(
        run_dir=run_dir,
        run_id=f"{run_id}-iter1",
        config=config,
        adapters=adapters,
        auto_approve_gates=auto_approve_gates,
        kb_root=kb_root,
    )
    all_results.append(results)
    score = _read_quality_score(run_dir)
    iteration_scores.append(score)
    logger.info("Iteration 1 score: %s", score)
    # --- Iterative improvement ---
    for iteration in range(2, max_iterations + 1):
        # Check if we've met quality threshold
        if score is not None and score >= quality_threshold:
            logger.info(
                "Quality threshold %.1f met (score=%.1f). Stopping.",
                quality_threshold,
                score,
            )
            break
        # Check convergence (score hasn't improved)
        if len(iteration_scores) >= convergence_rounds:
            recent = iteration_scores[-convergence_rounds:]
            if all(s is not None for s in recent):
                recent_scores = [float(s) for s in recent if s is not None]
                if max(recent_scores) - min(recent_scores) < 0.5:
                    logger.info(
                        "Convergence detected: scores %s unchanged for %d rounds. Stopping.",
                        recent,
                        convergence_rounds,
                    )
                    break
        # Write iteration context with feedback from reviews
        reviews_text = ""
        reviews_path = run_dir / "stage-18" / "reviews.md"
        if reviews_path.exists():
            reviews_text = reviews_path.read_text(encoding="utf-8")
        _write_iteration_context(run_dir, iteration, reviews_text, score)
        # Re-run from PAPER_OUTLINE (stage 16) through EXPORT_PUBLISH (stage 22)
        logger.info(
            "Iteration %d/%d: re-running stages 16-22 with feedback",
            iteration,
            max_iterations,
        )
        results = execute_pipeline(
            run_dir=run_dir,
            run_id=f"{run_id}-iter{iteration}",
            config=config,
            adapters=adapters,
            from_stage=Stage.PAPER_OUTLINE,
            auto_approve_gates=auto_approve_gates,
            kb_root=kb_root,
        )
        all_results.append(results)
        score = _read_quality_score(run_dir)
        iteration_scores.append(score)
        logger.info("Iteration %d score: %s", iteration, score)
    # --- Build iterative summary ---
    # Recompute convergence over the final window (the loop may have exited
    # on max_iterations without evaluating it).
    converged = False
    if len(iteration_scores) >= convergence_rounds:
        recent_window = iteration_scores[-convergence_rounds:]
        if all(s is not None for s in recent_window):
            recent_scores = [float(s) for s in recent_window if s is not None]
            converged = max(recent_scores) - min(recent_scores) < 0.5
    summary: dict[str, object] = {
        "run_id": run_id,
        "total_iterations": len(iteration_scores),
        "iteration_scores": iteration_scores,
        "quality_threshold": quality_threshold,
        "converged": converged,
        "final_score": iteration_scores[-1] if iteration_scores else None,
        "met_threshold": score is not None and score >= quality_threshold,
        "stages_per_iteration": [len(r) for r in all_results],
        "generated": _utcnow_iso(),
    }
    (run_dir / "iteration_summary.json").write_text(
        json.dumps(summary, indent=2, default=str), encoding="utf-8"
    )
    # --- Package deliverables into a single folder ---
    try:
        deliverables_dir = _package_deliverables(run_dir, run_id, config)
        if deliverables_dir is not None:
            print(f"[{run_id}] Deliverables packaged → {deliverables_dir}")
    except Exception:  # noqa: BLE001
        logger.warning("Deliverables packaging failed (non-blocking)")
    return summary
def _metaclaw_post_pipeline(
config: RCConfig,
results: list[StageResult],
lessons: list[object],
run_id: str,
run_dir: Path,
) -> None:
"""MetaClaw bridge: post-pipeline hook.
1. Convert high-severity lessons into MetaClaw skills.
2. Record skill effectiveness feedback.
3. Signal session end to MetaClaw proxy.
"""
bridge = getattr(config, "metaclaw_bridge", None)
if not bridge or not getattr(bridge, "enabled", False):
return
from researchclaw.llm.client import LLMClient
# 1. Lesson-to-skill conversion
l2s = getattr(bridge, "lesson_to_skill", None)
if l2s and getattr(l2s, "enabled", False) and lessons:
try:
from researchclaw.metaclaw_bridge.lesson_to_skill import (
convert_lessons_to_skills,
)
min_sev = getattr(l2s, "min_severity", "warning")
llm = LLMClient.from_rc_config(config)
new_skills = convert_lessons_to_skills(
lessons,
llm,
getattr(bridge, "skills_dir", "~/.metaclaw/skills"),
min_severity=min_sev,
max_skills=getattr(l2s, "max_skills_per_run", 3),
)
if new_skills:
logger.info(
"MetaClaw: generated %d new skills from lessons: %s",
len(new_skills),
new_skills,
)
except Exception: # noqa: BLE001
logger.warning("MetaClaw lesson-to-skill conversion failed", exc_info=True)
# 2. Skill effectiveness feedback
try:
from researchclaw.metaclaw_bridge.skill_feedback import (
SkillFeedbackStore,
record_stage_skills,
)
from researchclaw.metaclaw_bridge.stage_skill_map import get_stage_config
feedback_store = SkillFeedbackStore(run_dir / "evolution" / "skill_effectiveness.jsonl")
for result in results:
stage_num = int(getattr(result, "stage", 0))
stage_name = {
1: "topic_init", 2: "problem_decompose", 3: "search_strategy",
4: "literature_collect", 5: "literature_screen", 6: "knowledge_extract",
7: "synthesis", 8: "hypothesis_gen", 9: "experiment_design",
10: "code_generation", 11: "resource_planning", 12: "experiment_run",
13: "iterative_refine", 14: "result_analysis", 15: "research_decision",
16: "paper_outline", 17: "paper_draft", 18: "peer_review",
19: "paper_revision", 20: "quality_gate", 21: "knowledge_archive",
22: "export_publish", 23: "citation_verify",
}.get(stage_num, "")
if not stage_name:
continue
stage_config = get_stage_config(stage_name)
active_skills = stage_config.get("skills", [])
status = str(getattr(result, "status", ""))
success = "done" in status.lower()
if active_skills:
record_stage_skills(
feedback_store,
stage_name,
run_id,
success,
active_skills,
)
except Exception: # noqa: BLE001
logger.warning("MetaClaw skill feedback recording failed")
# 3. Signal session end (fire-and-forget)
try:
from researchclaw.metaclaw_bridge.session import MetaClawSession
import json as _json
import urllib.request as _urllib_req
session = MetaClawSession(run_id)
end_headers = session.end()
# Send a minimal request to signal session end
proxy_url = getattr(bridge, "proxy_url", "http://localhost:30000")
url = f"{proxy_url.rstrip('/')}/v1/chat/completions"
body = _json.dumps({
"model": "session-end",
"messages": [{"role": "user", "content": "session complete"}],
"max_tokens": 1,
}).encode("utf-8")
headers = {"Content-Type": "application/json"}
headers.update(end_headers)
req = _urllib_req.Request(url, data=body, headers=headers)
try:
_urllib_req.urlopen(req, timeout=5)
except Exception: # noqa: BLE001
pass # Best-effort signal
except Exception: # noqa: BLE001
pass
================================================
FILE: researchclaw/pipeline/stage_impls/__init__.py
================================================
"""Stage implementation modules for the research pipeline executor."""
================================================
FILE: researchclaw/pipeline/stage_impls/_analysis.py
================================================
"""Stages 14-15: Result analysis and research decision."""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
from typing import Any
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain, _is_ml_domain
from researchclaw.pipeline._helpers import (
StageResult,
_build_context_preamble,
_chat_with_prompt,
_collect_experiment_results,
_collect_json_context,
_get_evolution_overlay,
_multi_perspective_generate,
_read_prior_artifact,
_safe_json_loads,
_synthesize_perspectives,
_utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
def _execute_result_analysis(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 14: aggregate experiment results and write analysis artifacts.

    Flow (each step carries its bug-tracker tag in the inline comments):
      1. Collect Stage 12 run metrics; merge Stage 13 refinement metrics
         when they are actually better (R13-1 / BUG-165 / BUG-205).
      2. Extract paired comparisons from run and refinement stdout (R19-2),
         recomputing them from per-seed data when suspicious (R33).
      3. Build per-condition summaries with CIs (R19-3, R33) and flag
         broken ablations / insufficient seeds / zero variance (P8, B).
      4. Write ``experiment_summary.json`` (+ ``results_table.tex``), then
         produce ``analysis.md`` via multi-perspective LLM debate, or a
         data-filled template when ``llm`` is None.
      5. Generate charts early via FigureAgent, falling back to the legacy
         ``visualize.py`` path (IMP-6 + FA).

    Returns:
        StageResult for RESULT_ANALYSIS with the artifact list produced.
    """
    # --- Collect experiment data ---
    exp_data = _collect_experiment_results(
        run_dir,
        metric_key=config.experiment.metric_key,
        metric_direction=config.experiment.metric_direction,
    )
    runs_dir = _read_prior_artifact(run_dir, "runs/") or ""
    context = ""
    if runs_dir:
        context = _collect_json_context(Path(runs_dir), max_files=30)
    # --- R13-1: Merge Stage 13 (ITERATIVE_REFINE) results if available ---
    # Stage 13 stores richer per-condition metrics in refinement_log.json
    # that _collect_experiment_results() misses (it only scans runs/ dirs).
    _refine_log_text = _read_prior_artifact(run_dir, "refinement_log.json")
    if _refine_log_text:
        try:
            _refine_data = json.loads(_refine_log_text)
            _best_iter = None
            _best_ver = _refine_data.get("best_version", "")

            def _get_best_sandbox(it: dict) -> dict:
                """BUG-181: Metrics may be in sandbox or sandbox_after_fix."""
                sbx = it.get("sandbox", {})
                if sbx.get("metrics"):
                    return sbx
                sbx_fix = it.get("sandbox_after_fix", {})
                if sbx_fix.get("metrics"):
                    return sbx_fix
                # Fall back to the (metric-less) primary sandbox record.
                return sbx

            # Prefer the iteration matching best_version, if it has metrics.
            for _it in _refine_data.get("iterations", []):
                _sbx = _get_best_sandbox(_it)
                _it_metrics = _sbx.get("metrics", {})
                if _it.get("version_dir", "") == _best_ver and _it_metrics:
                    _best_iter = _it
                    break
            # If no version match, take the first iteration with metrics
            if _best_iter is None:
                for _it in _refine_data.get("iterations", []):
                    _sbx = _get_best_sandbox(_it)
                    if _sbx.get("metrics"):
                        _best_iter = _it
                        break
            if _best_iter is not None:
                _sbx = _get_best_sandbox(_best_iter)
                _refine_metrics = _sbx.get("metrics", {})
                # BUG-165 fix: Prefer Stage 13 refinement data when it is
                # actually better. The old `or True` unconditionally
                # replaced existing data, causing catastrophic regressions
                # (BUG-205: v1=78.93% destroyed by v3=8.65%).
                _refine_is_better = not exp_data["metrics_summary"]
                if not _refine_is_better and _refine_metrics:
                    # Compare primary_metric values to decide
                    _mkey = config.experiment.metric_key or "primary_metric"
                    _mdir = config.experiment.metric_direction or "maximize"
                    _existing_pm: float | None = None
                    _refine_pm: float | None = None
                    # BUG-214: Use exact match first, then substring fallback
                    # to avoid "accuracy" matching "balanced_accuracy".
                    _ms_items = list((exp_data.get("metrics_summary") or {}).items())
                    for _k, _v in _ms_items:
                        if _k == _mkey:
                            try:
                                _existing_pm = float(_v["mean"] if isinstance(_v, dict) else _v)
                            except (TypeError, ValueError, KeyError):
                                pass
                            break
                    else:
                        # for-else: no exact key match — try substring match.
                        for _k, _v in _ms_items:
                            if _mkey in _k:
                                try:
                                    _existing_pm = float(_v["mean"] if isinstance(_v, dict) else _v)
                                except (TypeError, ValueError, KeyError):
                                    pass
                                break
                    _refine_items = list(_refine_metrics.items())
                    for _k, _v in _refine_items:
                        if _k == _mkey:
                            try:
                                _refine_pm = float(_v)
                            except (TypeError, ValueError):
                                pass
                            break
                    else:
                        for _k, _v in _refine_items:
                            if _mkey in _k:
                                try:
                                    _refine_pm = float(_v)
                                except (TypeError, ValueError):
                                    pass
                                break
                    if _existing_pm is None:
                        _refine_is_better = True  # no existing data
                    elif _refine_pm is not None:
                        if _mdir == "maximize":
                            _refine_is_better = _refine_pm > _existing_pm
                        else:
                            _refine_is_better = _refine_pm < _existing_pm
                    logger.info(
                        "Stage 14: Refine metric comparison: existing=%s, refine=%s, "
                        "direction=%s → refine_is_better=%s",
                        _existing_pm, _refine_pm, _mdir, _refine_is_better,
                    )
                if _refine_metrics and _refine_is_better:
                    # Refinement has richer data — rebuild metrics_summary from it
                    _new_summary: dict[str, dict[str, float | None]] = {}
                    for _mk, _mv in _refine_metrics.items():
                        try:
                            _fv = float(_mv)
                            # Single observation: min == max == mean, count 1.
                            _new_summary[_mk] = {
                                "min": round(_fv, 6),
                                "max": round(_fv, 6),
                                "mean": round(_fv, 6),
                                "count": 1,
                            }
                        except (ValueError, TypeError):
                            pass
                    if _new_summary:
                        exp_data["metrics_summary"] = _new_summary
                        # Also update best_run with refinement data
                        exp_data["best_run"] = {
                            "run_id": "iterative-refine-best",
                            "task_id": "sandbox-main",
                            "status": "completed",
                            "metrics": {
                                k: v for k, v in _refine_metrics.items()
                            },
                            "elapsed_sec": _sbx.get("elapsed_sec", 0),
                            "stdout": "",  # omit for brevity
                            "stderr": _sbx.get("stderr", ""),
                            "timed_out": _sbx.get("timed_out", False),
                        }
                        # Rebuild latex table
                        _ltx = [
                            r"\begin{table}[h]", r"\centering",
                            r"\caption{Experiment Results (Best Refinement Iteration)}",
                            r"\begin{tabular}{lrrrr}", r"\hline",
                            r"Metric & Min & Max & Mean & N \\", r"\hline",
                        ]
                        for _col in sorted(_new_summary.keys()):
                            _s = _new_summary[_col]
                            _ltx.append(
                                f"{_col} & {_s['min']:.4f} & {_s['max']:.4f} "
                                f"& {_s['mean']:.4f} & {_s['count']} \\\\"
                            )
                        _ltx.extend([r"\hline", r"\end{tabular}", r"\end{table}"])
                        exp_data["latex_table"] = "\n".join(_ltx)
                        # Count unique conditions (keys without 'seed' and not
                        # ending in '_std' — NOTE: '_mean' keys are NOT excluded
                        # here, so the count may include mean aggregates).
                        _conditions = {
                            k for k in _refine_metrics
                            if "seed" not in k and not k.endswith("_std")
                        }
                        exp_data["runs"] = [exp_data["best_run"]]
                        # Store condition count for accurate reporting
                        exp_data["best_run"]["condition_count"] = len(_conditions)
                        if not context:
                            context = json.dumps(
                                {"refinement_best_metrics": _refine_metrics},
                                indent=2, default=str,
                            )
                        _bm_val = _refine_data.get("best_metric")
                        logger.info(
                            "R13-1: Merged %d metrics from refinement_log (best_metric=%.4f)",
                            len(_refine_metrics),
                            float(_bm_val) if isinstance(_bm_val, (int, float)) else 0.0,
                        )
        except (json.JSONDecodeError, OSError, KeyError):
            logger.warning("R13-1: Failed to parse refinement_log.json, using Stage 12 data")
    # --- R19-2: Extract PAIRED comparisons from refinement stdout ---
    from researchclaw.experiment.sandbox import extract_paired_comparisons as _extract_paired
    _all_paired: list[dict[str, object]] = []
    # First: from _collect_experiment_results (Stage 12 runs/)
    if exp_data.get("paired_comparisons"):
        _all_paired.extend(exp_data["paired_comparisons"])
    # Second: from refinement_log iterations (Stage 13)
    if _refine_log_text:
        try:
            _rl = json.loads(_refine_log_text)
            for _it in _rl.get("iterations", []):
                for _sbx_key in ("sandbox", "sandbox_after_fix"):
                    _sbx_stdout = (_it.get(_sbx_key) or {}).get("stdout", "")
                    if _sbx_stdout:
                        _all_paired.extend(_extract_paired(_sbx_stdout))
        except (json.JSONDecodeError, OSError):
            pass
    # --- R19-3: Build structured condition_summaries from metrics ---
    _condition_summaries: dict[str, dict[str, Any]] = {}
    _ms = exp_data.get("metrics_summary", {})
    _best_metrics = {}
    if exp_data.get("best_run") and isinstance(exp_data["best_run"], dict):
        _best_metrics = exp_data["best_run"].get("metrics", {})
    # Group metrics by condition prefix (e.g., "ppo/primary_metric" → condition "ppo")
    for _mk, _mv in _best_metrics.items():
        parts = _mk.split("/")
        if len(parts) >= 2:
            cond = parts[0]
            metric_name = parts[-1]
            if cond not in _condition_summaries:
                _condition_summaries[cond] = {"metrics": {}}
            try:
                _condition_summaries[cond]["metrics"][metric_name] = float(_mv)
            except (ValueError, TypeError):
                pass
    # BUG-09 fix: If no condition summaries were built (metrics don't use
    # condition/metric format), try to extract from metrics_summary or
    # structured_results so FigureAgent has data to work with.
    if not _condition_summaries and _ms:
        # Try to parse condition data from metrics_summary keys
        for _mk, _mv in _ms.items():
            parts = _mk.split("/")
            if len(parts) >= 2:
                cond = parts[0]
                metric_name = parts[-1]
                if cond not in _condition_summaries:
                    _condition_summaries[cond] = {"metrics": {}}
                try:
                    # BUG-182: metrics_summary values are dicts {min,max,mean,count},
                    # not plain floats. Extract the mean value.
                    if isinstance(_mv, dict):
                        _val = float(_mv["mean"]) if "mean" in _mv else None
                    else:
                        _val = float(_mv)
                    if _val is not None:
                        _condition_summaries[cond]["metrics"][metric_name] = _val
                except (ValueError, TypeError, KeyError):
                    pass
    if not _condition_summaries:
        # Last resort: build from structured_results condition keys
        _sr = exp_data.get("structured_results", {})
        if isinstance(_sr, dict):
            for _sk, _sv in _sr.items():
                if isinstance(_sv, dict) and _sk not in ("metadata", "config"):
                    _condition_summaries[_sk] = {"metrics": {}}
                    for _smk, _smv in _sv.items():
                        try:
                            _condition_summaries[_sk]["metrics"][_smk] = float(_smv)
                        except (ValueError, TypeError):
                            pass
    # R33: Build per-seed data structure (needed for CIs and paired tests below)
    _seed_data: dict[str, dict[int, float]] = {}  # {condition: {seed: value}}
    for _mk, _mv in _best_metrics.items():
        parts = _mk.split("/")
        # Pattern: condition/regime/seed_id/primary_metric
        if len(parts) >= 4 and parts[-1] == config.experiment.metric_key:
            cond = parts[0]
            try:
                seed_id = int(parts[2])
                val = float(_mv)
                _seed_data.setdefault(cond, {})[seed_id] = val
            except (ValueError, TypeError):
                pass
    # Enrich condition summaries with seed counts, success rates, and CIs
    for _ck, _cv in _condition_summaries.items():
        # Look for success_rate in metrics
        sr_key = f"{_ck}/success_rate"
        if sr_key in _best_metrics:
            try:
                _cv["success_rate"] = float(_best_metrics[sr_key])
            except (ValueError, TypeError):
                pass
        # Count seed-level entries to estimate n_seeds
        _seed_count = 0
        for _mk in _best_metrics:
            if _mk.startswith(f"{_ck}/") and "seed" in _mk.lower():
                _seed_count += 1
        if _seed_count > 0:
            _cv["n_seed_metrics"] = _seed_count
        # R33: Compute mean ± std and bootstrap 95% CI from per-seed data
        # (requires at least 3 seeds for the statistics to be meaningful).
        if _ck in _seed_data and len(_seed_data[_ck]) >= 3:
            _vals = list(_seed_data[_ck].values())
            import statistics as _stats_mod
            _mean = _stats_mod.mean(_vals)
            _std = _stats_mod.stdev(_vals)
            _cv["metrics"][f"{config.experiment.metric_key}_mean"] = round(_mean, 6)
            _cv["metrics"][f"{config.experiment.metric_key}_std"] = round(_std, 6)
            _cv["n_seeds"] = len(_vals)
            # Bootstrap 95% CI (use local RNG to avoid corrupting global state)
            import random as _rng_mod
            _rng_local = _rng_mod.Random(42)
            _boot_means = []
            for _ in range(1000):
                _sample = [_rng_local.choice(_vals) for _ in range(len(_vals))]
                _boot_means.append(_stats_mod.mean(_sample))
            _boot_means.sort()
            # Percentile bootstrap: 2.5th and 97.5th percentile of the means.
            _ci_low = round(_boot_means[int(0.025 * len(_boot_means))], 6)
            _ci_high = round(_boot_means[int(0.975 * len(_boot_means))], 6)
            # IMP-16: Sanity check — CI must contain the mean
            if _ci_low > _mean or _ci_high < _mean:
                logger.warning(
                    "Bootstrap CI [%.4f, %.4f] does not contain mean %.4f "
                    "for condition %s — replacing CI with mean ± 1.96*SE",
                    _ci_low, _ci_high, _mean, _ck,
                )
                _se = _std / (len(_vals) ** 0.5)
                _ci_low = round(_mean - 1.96 * _se, 6)
                _ci_high = round(_mean + 1.96 * _se, 6)
            _cv["ci95_low"] = _ci_low
            _cv["ci95_high"] = _ci_high
    # Count totals
    _total_conditions = len(_condition_summaries) if _condition_summaries else None
    _total_metrics = len(_best_metrics) if _best_metrics else None
    # --- R33: Pipeline-level paired computation as fallback ---
    # If the experiment code's PAIRED lines are sparse or suspicious (e.g.,
    # all identical t-stats), compute fresh paired tests from per-seed data.
    # (_seed_data was built above before condition summary enrichment)
    if len(_seed_data) >= 2:
        # Find common seeds across conditions
        _all_seeds_sets = [set(v.keys()) for v in _seed_data.values()]
        _common_seeds = set.intersection(*_all_seeds_sets) if _all_seeds_sets else set()
        if len(_common_seeds) >= 3:
            _cond_names_sorted = sorted(_seed_data.keys())
            _pipeline_paired: list[dict[str, object]] = []
            # Compare each condition against the first baseline (alphabetically)
            _baseline_cond = _cond_names_sorted[0]
            for _other_cond in _cond_names_sorted[1:]:
                _diffs = []
                for _sid in sorted(_common_seeds):
                    _diffs.append(
                        _seed_data[_other_cond][_sid] - _seed_data[_baseline_cond][_sid]
                    )
                if _diffs:
                    import statistics
                    _n = len(_diffs)
                    _mean_d = statistics.mean(_diffs)
                    _std_d = statistics.stdev(_diffs) if _n > 1 else 0.0
                    # Paired t statistic: mean difference over its standard error.
                    _t = (_mean_d / (_std_d / (_n ** 0.5))) if _std_d > 0 else 0.0
                    _df = _n - 1
                    # Two-tailed p-value using t-distribution
                    import math
                    try:
                        from scipy.stats import t as _t_dist
                        _p = float(2 * _t_dist.sf(abs(_t), _df))
                    except ImportError:
                        # Normal approximation via erf, inflated for small df.
                        _p = 2 * (1 - 0.5 * (1 + math.erf(abs(_t) / (2 ** 0.5))))
                        if _df < 30:
                            _p = min(1.0, _p * (1 + 2.5 / max(_df, 1)))
                    _pipeline_paired.append({
                        "method": _other_cond,
                        "baseline": _baseline_cond,
                        "mean_diff": round(_mean_d, 6),
                        "std_diff": round(_std_d, 6),
                        "t_stat": round(_t, 4),
                        "p_value": round(_p, 6),
                        "n_seeds": _n,
                        "source": "pipeline_computed",
                    })
            # Use pipeline-computed if experiment code's are suspicious
            _exp_t_stats = {round(p.get("t_stat", 0), 4) for p in _all_paired}
            _all_identical = len(_exp_t_stats) <= 1 and len(_all_paired) > 1
            if _pipeline_paired and (_all_identical or len(_all_paired) < len(_pipeline_paired)):
                logger.info(
                    "R33: Using %d pipeline-computed paired tests (experiment code had %d, identical=%s)",
                    len(_pipeline_paired), len(_all_paired), _all_identical,
                )
                _all_paired = _pipeline_paired
    # --- P8: Detect identical conditions (broken ablations) ---
    _ablation_warnings: list[str] = []
    if _condition_summaries and len(_condition_summaries) >= 2:
        _cond_names = sorted(_condition_summaries.keys())
        # Compare every unordered pair of conditions.
        for _i in range(len(_cond_names)):
            for _j in range(_i + 1, len(_cond_names)):
                _c1, _c2 = _cond_names[_i], _cond_names[_j]
                _s1_raw = _condition_summaries[_c1]
                _s2_raw = _condition_summaries[_c2]
                # BUG-133 fix: compare inner metrics dicts, not top-level keys
                _s1_m = _s1_raw.get("metrics", {}) if isinstance(_s1_raw, dict) else {}
                _s2_m = _s2_raw.get("metrics", {}) if isinstance(_s2_raw, dict) else {}
                if not isinstance(_s1_m, dict):
                    _s1_m = {}
                if not isinstance(_s2_m, dict):
                    _s2_m = {}
                _shared_keys = set(_s1_m.keys()) & set(_s2_m.keys())
                if not _shared_keys:
                    continue
                _all_equal = True
                for _sk in _shared_keys:
                    _v1 = _s1_m[_sk]
                    _v2 = _s2_m[_sk]
                    if _v1 != _v2:
                        _all_equal = False
                        break
                if _all_equal and _shared_keys:
                    _warn = (
                        f"ABLATION FAILURE: Conditions '{_c1}' and '{_c2}' produce "
                        f"identical outputs across all {len(_shared_keys)} metrics. "
                        f"The ablation is invalid — the differentiating parameter "
                        f"is likely not used in the code."
                    )
                    _ablation_warnings.append(_warn)
                    logger.warning("P8: %s", _warn)
                elif _shared_keys:
                    # R5-BUG-03: Also flag near-identical conditions (< 1% relative diff)
                    _near_identical = True
                    for _sk in _shared_keys:
                        _v1 = _s1_m[_sk]
                        _v2 = _s2_m[_sk]
                        try:
                            _v1f, _v2f = float(_v1), float(_v2)
                            _denom = max(abs(_v1f), abs(_v2f), 1e-12)
                            if abs(_v1f - _v2f) / _denom > 0.01:
                                _near_identical = False
                                break
                        except (TypeError, ValueError):
                            _near_identical = False
                            break
                    if _near_identical:
                        _warn = (
                            f"ABLATION WARNING: Conditions '{_c1}' and '{_c2}' produce "
                            f"near-identical outputs (<1% relative difference) across "
                            f"all {len(_shared_keys)} metrics. The ablation may be trivial."
                        )
                        _ablation_warnings.append(_warn)
                        logger.warning("P8: %s", _warn)
    # --- Improvement B: Validate seed counts ---
    _seed_insufficiency_warnings: list[str] = []
    for _sc_name, _sc_seeds in _seed_data.items():
        _n_seeds = len(_sc_seeds)
        if 0 < _n_seeds < 3:
            _warn = (
                f"SEED_INSUFFICIENCY: Condition '{_sc_name}' has only "
                f"{_n_seeds} seed(s) (minimum 3 required for statistical validity)"
            )
            _seed_insufficiency_warnings.append(_warn)
            logger.warning("B: %s", _warn)
    # --- Write structured experiment summary ---
    summary_payload = {
        "metrics_summary": exp_data["metrics_summary"],
        "total_runs": len(exp_data["runs"]),
        "best_run": exp_data["best_run"],
        "latex_table": exp_data["latex_table"],
        "generated": _utcnow_iso(),
    }
    if _seed_insufficiency_warnings:
        summary_payload["seed_insufficiency_warnings"] = _seed_insufficiency_warnings
    # R13-1: Detect zero-variance across conditions (all conditions identical primary metric)
    if _condition_summaries and len(_condition_summaries) >= 2:
        _primary_vals = []
        for _cs in _condition_summaries.values():
            if isinstance(_cs, dict):
                # Try 'metrics' dict first (actual structure), then 'primary_metric' fallback
                _metrics = _cs.get("metrics", {})
                if isinstance(_metrics, dict) and _metrics:
                    _pv_candidate = next(iter(_metrics.values()), None)
                    if isinstance(_pv_candidate, dict):
                        _pv_candidate = _pv_candidate.get("mean")
                    if isinstance(_pv_candidate, (int, float)):
                        _primary_vals.append(_pv_candidate)
                    continue
                _pm = _cs.get("primary_metric", {})
                _pv = _pm.get("mean") if isinstance(_pm, dict) else _pm
                if isinstance(_pv, (int, float)):
                    _primary_vals.append(_pv)
        if len(_primary_vals) >= 2 and len(set(_primary_vals)) == 1:
            _zv_warn = (
                f"ZERO VARIANCE: All {len(_primary_vals)} conditions have "
                f"identical primary_metric ({_primary_vals[0]}). "
                f"Experiment condition wiring is likely broken."
            )
            _ablation_warnings.append(_zv_warn)
            logger.warning("R13-1: %s", _zv_warn)
    if _ablation_warnings:
        summary_payload["ablation_warnings"] = _ablation_warnings
    if _all_paired:
        summary_payload["paired_comparisons"] = _all_paired
    if _condition_summaries:
        summary_payload["condition_summaries"] = _condition_summaries
        summary_payload["condition_metrics"] = _condition_summaries  # alias for quality gate
        summary_payload["total_conditions"] = _total_conditions
    if _total_metrics:
        summary_payload["total_metric_keys"] = _total_metrics
    (stage_dir / "experiment_summary.json").write_text(
        json.dumps(summary_payload, indent=2, default=str), encoding="utf-8"
    )
    if exp_data["latex_table"]:
        (stage_dir / "results_table.tex").write_text(
            exp_data["latex_table"], encoding="utf-8"
        )
    # --- Build data-augmented prompt ---
    preamble = _build_context_preamble(
        config, run_dir, include_goal=True, include_hypotheses=True
    )
    data_context = ""
    if exp_data["metrics_summary"]:
        lines = ["\n## Quantitative Results"]
        for mk, mv in exp_data["metrics_summary"].items():
            if isinstance(mv, dict):
                lines.append(
                    f"- {mk}: mean={mv.get('mean', '?')}, min={mv.get('min', '?')}, "
                    f"max={mv.get('max', '?')}, n={mv.get('count', '?')}"
                )
        data_context = "\n".join(lines)
    # Append structured results if available
    if exp_data.get("structured_results"):
        structured_text = json.dumps(
            exp_data["structured_results"], indent=2, default=str
        )
        # Truncate to avoid blowing up context
        if len(structured_text) > 6000:
            structured_text = structured_text[:6000] + "\n... (truncated)"
        data_context += (
            f"\n\n## Structured Experiment Results (from results.json)\n"
            f"```json\n{structured_text}\n```"
        )
    # P8: Inject ablation warnings into data context
    if _ablation_warnings:
        data_context += "\n\nCRITICAL ABLATION WARNINGS:\n"
        for _aw in _ablation_warnings:
            data_context += f"- {_aw}\n"
        data_context += (
            "\nYou MUST address these in your analysis. Identical conditions "
            "mean the ablation design is broken and the comparison is meaningless.\n"
        )
    if llm is not None:
        _pm = prompts or PromptManager()
        from researchclaw.prompts import DEBATE_ROLES_ANALYSIS  # noqa: PLC0415
        # --- Multi-perspective debate ---
        perspectives_dir = stage_dir / "perspectives"
        variables = {
            "preamble": preamble,
            "data_context": data_context,
            "context": context,
        }
        perspectives = _multi_perspective_generate(
            llm, DEBATE_ROLES_ANALYSIS, variables, perspectives_dir
        )
        # --- Synthesize into unified analysis ---
        analysis = _synthesize_perspectives(
            llm, perspectives, "analysis_synthesize", _pm
        )
    else:
        # Template with real data if available
        ms = exp_data["metrics_summary"]
        metrics_block = ""
        if ms:
            for mk, mv in ms.items():
                if isinstance(mv, dict):
                    metrics_block += (
                        f"- **{mk}**: mean={mv.get('mean')}, "
                        f"min={mv.get('min')}, max={mv.get('max')}, n={mv.get('count')}\n"
                    )
        else:
            metrics_block = f"- Primary metric key: `{config.experiment.metric_key}`\n- No quantitative data yet.\n"
        analysis = f"""# Result Analysis
## Metrics Summary
{metrics_block}
## Comparative Findings
- Proposed approach results from {len(exp_data["runs"])} run(s) collected.
## Statistical Checks
- Recommend confidence interval and seed-wise variance reporting.
## Limitations
- Limited runs and synthetic constraints.
## Conclusion
- Proceed to decision stage with moderate confidence.
Generated: {_utcnow_iso()}
"""
    (stage_dir / "analysis.md").write_text(analysis, encoding="utf-8")
    artifacts = ["analysis.md", "experiment_summary.json"]
    if (stage_dir / "results_table.tex").exists():
        artifacts.append("results_table.tex")
    # IMP-6 + FA: Generate charts early (Stage 14) so paper draft can reference them
    # Try FigureAgent first (multi-agent intelligent charts), fall back to visualize.py
    _figure_plan_saved = False
    if config.experiment.figure_agent.enabled and llm is not None:
        try:
            from researchclaw.agents.figure_agent import FigureOrchestrator
            from researchclaw.agents.figure_agent.orchestrator import FigureAgentConfig as _FACfg
            _fa_cfg = _FACfg(
                enabled=True,
                min_figures=config.experiment.figure_agent.min_figures,
                max_figures=config.experiment.figure_agent.max_figures,
                max_iterations=config.experiment.figure_agent.max_iterations,
                render_timeout_sec=config.experiment.figure_agent.render_timeout_sec,
                use_docker=config.experiment.figure_agent.use_docker,
                docker_image=config.experiment.figure_agent.docker_image,
                output_format=config.experiment.figure_agent.output_format,
                gemini_api_key=config.experiment.figure_agent.gemini_api_key,
                gemini_model=config.experiment.figure_agent.gemini_model,
                nano_banana_enabled=config.experiment.figure_agent.nano_banana_enabled,
                strict_mode=config.experiment.figure_agent.strict_mode,
                dpi=config.experiment.figure_agent.dpi,
            )
            _fa = FigureOrchestrator(llm, _fa_cfg, stage_dir=stage_dir)
            # Build conditions list from condition_summaries
            _fa_conditions = list(_condition_summaries.keys()) if _condition_summaries else []
            # BUG-09 fix: pass best_run metrics as fallback data if
            # structured_results is empty, so Planner has some data to chart
            _fa_exp_results = exp_data.get("structured_results", {})
            if not _fa_exp_results and _best_metrics:
                _fa_exp_results = {"best_run_metrics": _best_metrics}
            # Read paper draft for Decision Agent analysis
            _paper_draft = (
                _read_prior_artifact(run_dir, "paper_draft.md")
                or _read_prior_artifact(run_dir, "outline.md")
                or ""
            )
            _fa_plan = _fa.orchestrate({
                "experiment_results": _fa_exp_results,
                "condition_summaries": _condition_summaries,
                "metrics_summary": exp_data.get("metrics_summary", {}),
                "metric_key": config.experiment.metric_key,
                "conditions": _fa_conditions,
                "topic": _read_prior_artifact(run_dir, "topic.md") or config.research.topic,
                "hypothesis": _read_prior_artifact(run_dir, "hypotheses.md") or "",
                "paper_draft": _paper_draft,
                "output_dir": str(stage_dir / "charts"),
            })
            if _fa_plan.figure_count > 0:
                # Save figure plan for Stage 17 to read
                (stage_dir / "figure_plan.json").write_text(
                    json.dumps(_fa_plan.to_dict(), indent=2, default=str),
                    encoding="utf-8",
                )
                _figure_plan_saved = True
                for _cf_name in _fa_plan.get_chart_files():
                    artifacts.append(f"charts/{_cf_name}")
                logger.info(
                    "Stage 14: FigureAgent generated %d charts (%d passed review, %.1fs)",
                    _fa_plan.figure_count,
                    _fa_plan.passed_count,
                    _fa_plan.elapsed_sec,
                )
            else:
                logger.warning("Stage 14: FigureAgent produced no charts, falling back")
        except Exception as _fa_exc:
            logger.warning("Stage 14: FigureAgent failed (%s), falling back to visualize.py", _fa_exc)
    # Fallback: legacy visualize.py chart generation
    if not _figure_plan_saved:
        try:
            from researchclaw.experiment.visualize import (
                generate_all_charts as _gen_charts_early,
            )
            _charts_dir = stage_dir / "charts"
            _early_charts = _gen_charts_early(
                run_dir,
                _charts_dir,
                metric_key=config.experiment.metric_key,
            )
            if _early_charts:
                for _cp in _early_charts:
                    artifacts.append(f"charts/{_cp.name}")
                logger.info(
                    "Stage 14: Generated %d early charts (legacy) for paper embedding",
                    len(_early_charts),
                )
        except Exception as _chart_exc:
            logger.warning("Stage 14: Early chart generation failed: %s", _chart_exc)
    return StageResult(
        stage=Stage.RESULT_ANALYSIS,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-14/{a}" for a in artifacts),
    )
def _parse_decision(text: str) -> str:
"""Extract PROCEED/PIVOT/REFINE from decision text.
Looks for the first standalone keyword on its own line after a
``## Decision`` heading. Falls back to a keyword scan of the first
few lines after the heading, but only matches the keyword itself
(not mentions inside explanatory prose like "PIVOT is not warranted").
Returns lowercase ``"proceed"`` / ``"pivot"`` / ``"refine"``.
Defaults to ``"proceed"`` if nothing matches.
"""
import re as _re
text_upper = text.upper()
# Look in the first occurrence after "## Decision" heading
decision_section = ""
for keyword in ("## DECISION", "## Decision", "## decision"):
if keyword.upper() in text_upper:
idx = text_upper.index(keyword.upper())
decision_section = text[idx : idx + 200]
break
search_text = decision_section or text[:500]
# First try: look for a line that is just the keyword (possibly with
# whitespace / markdown bold / trailing punctuation).
for line in search_text.splitlines():
stripped = line.strip().strip("*").strip("#").strip()
if stripped.upper() in ("PROCEED", "PIVOT", "REFINE"):
return stripped.lower()
# Fallback: regex for standalone word boundaries so that
# "PIVOT is not warranted" does NOT match as a decision.
for kw in ("PIVOT", "REFINE", "PROCEED"):
# Only match if the keyword appears as the FIRST keyword-class token
# on its own (not embedded in a sentence saying "not PIVOT").
pattern = _re.compile(
r"(?:^|##\s*Decision\s*\n\s*)" + kw, _re.IGNORECASE | _re.MULTILINE
)
if pattern.search(search_text):
return kw.lower()
# Last resort: position-based — prefer whichever keyword appears LAST
# (the final conclusion after deliberation is more reliable than early mentions)
# BUG-DA8-08: Old code always returned "refine" when both keywords present
search_upper = search_text.upper()
last_refine = search_upper.rfind("REFINE")
last_pivot = search_upper.rfind("PIVOT")
if last_refine >= 0 and (last_pivot < 0 or last_refine > last_pivot):
return "refine"
if last_pivot >= 0 and (last_refine < 0 or last_pivot > last_refine):
return "pivot"
return "proceed"
def _execute_research_decision(
stage_dir: Path,
run_dir: Path,
config: RCConfig,
adapters: AdapterBundle,
*,
llm: LLMClient | None = None,
prompts: PromptManager | None = None,
) -> StageResult:
analysis = _read_prior_artifact(run_dir, "analysis.md") or ""
# P6: Detect degenerate REFINE cycles — inject warning if metrics stagnate
_degenerate_hint = ""
_refine_log = _read_prior_artifact(run_dir, "refinement_log.json")
if _refine_log:
try:
_rl = json.loads(_refine_log)
_iters = _rl.get("iterations", [])
_metrics = [it.get("metric") for it in _iters if isinstance(it, dict)]
_valid = [m for m in _metrics if m is not None]
_all_saturated = _valid and all(m <= 0.001 or m >= 0.999 for m in _valid)
_all_identical = len(set(_valid)) <= 1 and len(_valid) >= 2
if _all_saturated or _all_identical:
_degenerate_hint = (
"\n\nSYSTEM WARNING — DEGENERATE REFINE CYCLE DETECTED:\n"
f"Metrics across {len(_valid)} iterations: {_valid}\n"
"All iterations produce identical/saturated results. Further REFINE "
"cycles CANNOT fix this — the underlying benchmark design is too "
"easy/hard. You SHOULD choose PROCEED with a quality caveat rather "
"than REFINE again.\n"
)
logger.warning("P6: Degenerate refine cycle detected, injecting PROCEED hint")
except (json.JSONDecodeError, OSError):
pass
# Phase 2: Inject experiment diagnosis into decision prompt
_diagnosis_hint = ""
_diag_path = run_dir / "experiment_diagnosis.json"
if _diag_path.exists():
try:
_diag_data = json.loads(_diag_path.read_text(encoding="utf-8"))
_qa = _diag_data.get("quality_assessment", {})
_mode = _qa.get("mode", "unknown")
_sufficient = _qa.get("sufficient", False)
_deficiency_types = _qa.get("deficiency_types", [])
if not _sufficient:
_diagnosis_hint = (
"\n\n## EXPERIMENT DIAGNOSIS (from automated analysis)\n"
f"Quality mode: {_mode}\n"
f"Sufficient for full paper: NO\n"
f"Issues found: {', '.join(_deficiency_types)}\n\n"
"IMPORTANT: The experiment has significant issues. "
"If REFINE is chosen, a structured repair prompt is available "
"at repair_prompt.txt with specific fixes for identified issues.\n"
"If the same issues persist after 2+ REFINE cycles, choose PROCEED "
"with appropriate quality caveats.\n"
)
logger.info(
"Stage 15: Injected experiment diagnosis — mode=%s, issues=%s",
_mode, _deficiency_types,
)
except (json.JSONDecodeError, OSError):
pass
# Improvement C: Check ablation quality — if >50% trivial, push REFINE
_ablation_refine_hint = ""
# BUG-DA8-16: Prefer experiment_summary_best.json (promoted best) over
# alphabetically-last stage-14* (which could be a stale versioned dir)
_exp_sum_path = run_dir / "experiment_summary_best.json"
if not _exp_sum_path.is_file():
_exp_sum_path = None
for _s14 in sorted(run_dir.glob("stage-14*/experiment_summary.json"), reverse=True):
_exp_sum_path = _s14
break
if _exp_sum_path and _exp_sum_path.is_file():
try:
from researchclaw.pipeline.stage_impls._paper_writing import _check_ablation_effectiveness
_abl_exp = json.loads(_exp_sum_path.read_text(encoding="utf-8"))
_abl_warnings = _check_ablation_effectiveness(_abl_exp, threshold=0.02)
if _abl_warnings:
_trivial_count = sum(1 for w in _abl_warnings if "ineffective" in w.lower() or "trivial" in w.lower())
_total_abl = max(1, len(_abl_warnings))
if _trivial_count / _total_abl > 0.5:
_ablation_refine_hint = (
"\n\n## ABLATION QUALITY ASSESSMENT (CRITICAL)\n"
f"STRONG RECOMMENDATION: Choose REFINE.\n"
f"{_trivial_count}/{_total_abl} ablations show <2% difference from baseline "
f"(trivially similar). This means the ablation design is broken.\n"
"Warnings:\n" + "\n".join(f"- {w}" for w in _abl_warnings) + "\n"
)
logger.warning("C: %d/%d ablations trivial → recommending REFINE", _trivial_count, _total_abl)
except Exception: # noqa: BLE001
pass
if llm is not None:
_pm = prompts or PromptManager()
_overlay = _get_evolution_overlay(run_dir, "research_decision")
sp = _pm.for_stage("research_decision", evolution_overlay=_overlay, analysis=analysis)
_user = sp.user + _degenerate_hint + _diagnosis_hint + _ablation_refine_hint
resp = _chat_with_prompt(llm, sp.system, _user)
decision_md = resp.content
else:
decision_md = f"""# Research Decision
## Decision
PROCEED
## Justification
Current evidence suggests measurable progress with actionable limitations.
## Next Actions
- Build detailed paper outline
- Expand ablation and uncertainty analysis in writing
Generated: {_utcnow_iso()}
"""
(stage_dir / "decision.md").write_text(decision_md, encoding="utf-8")
# --- Extract structured decision ---
decision = _parse_decision(decision_md)
# T3.1: Validate decision quality — check for minimum experiment rigor
_quality_warnings: list[str] = []
_dec_lower = decision_md.lower()
if "baseline" not in _dec_lower and "control" not in _dec_lower:
_quality_warnings.append("Decision text does not mention baselines")
if "seed" not in _dec_lower and "replicat" not in _dec_lower and "run" not in _dec_lower:
_quality_warnings.append("Decision text does not mention multi-seed/replicate runs")
if "metric" not in _dec_lower and "accuracy" not in _dec_lower and "loss" not in _dec_lower:
_quality_warnings.append("Decision text does not mention evaluation metrics")
if _quality_warnings:
logger.warning("T3.1: Decision quality warnings: %s", _quality_warnings)
decision_payload = {
"decision": decision,
"raw_text_excerpt": decision_md[:500],
"quality_warnings": _quality_warnings,
"generated": _utcnow_iso(),
}
(stage_dir / "decision_structured.json").write_text(
json.dumps(decision_payload, indent=2), encoding="utf-8"
)
logger.info("Research decision: %s", decision)
return StageResult(
stage=Stage.RESEARCH_DECISION,
status=StageStatus.DONE,
artifacts=("decision.md", "decision_structured.json"),
evidence_refs=("stage-15/decision.md",),
decision=decision,
)
================================================
FILE: researchclaw/pipeline/stage_impls/_code_generation.py
================================================
"""Stage 10: Code generation."""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
from typing import Any
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.experiment.validator import (
CodeValidation,
format_issues_for_llm,
validate_code,
)
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
StageResult,
_chat_with_prompt,
_ensure_sandbox_deps,
_extract_code_block,
_extract_multi_file_blocks,
_extract_yaml_block,
_get_evolution_overlay,
_load_hardware_profile,
_read_prior_artifact,
_safe_json_loads,
_utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
# Module-level logger shared by all helpers in this stage implementation.
logger = logging.getLogger(__name__)
# Improvement G: Continuous-action environments that are incompatible with DQN
_CONTINUOUS_ENVS = {
"pendulum", "halfcheetah", "hopper", "walker2d", "ant", "humanoid",
"swimmer", "reacher", "invertedpendulum", "inverteddoublependulum",
"mountaincarcontinuous", "lunarlander-continuous",
}
def _check_rl_compatibility(code: str) -> list[str]:
"""Detect DQN + continuous-action environment mismatches.
Returns a list of error strings if incompatible combinations are found.
"""
errors: list[str] = []
code_lower = code.lower()
has_dqn = "dqn" in code_lower
if not has_dqn:
return errors
for env_name in _CONTINUOUS_ENVS:
if env_name in code_lower:
errors.append(
f"RL COMPATIBILITY ERROR: DQN is used with continuous-action "
f"environment '{env_name}'. DQN only works with DISCRETE action "
f"spaces. Use SAC, TD3, or PPO instead."
)
return errors
def _execute_code_generation(
stage_dir: Path,
run_dir: Path,
config: RCConfig,
adapters: AdapterBundle,
*,
llm: LLMClient | None = None,
prompts: PromptManager | None = None,
) -> StageResult:
exp_plan = _read_prior_artifact(run_dir, "exp_plan.yaml") or ""
metric = config.experiment.metric_key
max_repair = 5 # BUG-14: Increased from 3 to give more chances for critical bugs
files: dict[str, str] = {}
validation_log: list[str] = []
# --- Detect available packages for sandbox ---
_pm = prompts or PromptManager()
# --- Hardware-aware package hint ---
hw_profile = _load_hardware_profile(run_dir)
if config.experiment.mode in ("sandbox", "docker"):
if config.experiment.mode == "docker":
pkg_prefix = "docker mode"
_net_policy = config.experiment.docker.network_policy
_base_pkgs = (
", torchvision, torchaudio, matplotlib, seaborn, scipy, "
"tqdm, torchdiffeq, gymnasium, networkx, PyYAML, Pillow, "
"transformers, datasets, accelerate, peft, bitsandbytes, "
"timm, einops, torchmetrics, h5py"
)
if _net_policy == "none":
pkg_extras = _base_pkgs + " (ONLY pre-installed packages — NO pip install available)"
elif _net_policy in ("setup_only", "pip_only"):
pkg_extras = _base_pkgs + ", and additional pip-installable packages via requirements.txt"
else:
pkg_extras = _base_pkgs + ", and additional pip-installable packages (auto-detected from imports)"
else:
pkg_prefix = "sandbox mode"
pkg_extras = ""
if hw_profile and hw_profile.get("has_gpu"):
gpu_type = hw_profile.get("gpu_type", "cuda")
gpu_name = hw_profile.get("gpu_name", "GPU")
tier = hw_profile.get("tier", "limited")
if tier == "high":
device_hint = f"torch.device('{gpu_type}')"
pkg_hint = (
f"\nAVAILABLE PACKAGES ({pkg_prefix}): Python stdlib, numpy, torch, sklearn, scipy, pandas{pkg_extras}.\n"
f"GPU: {gpu_name} ({gpu_type}). You MAY use PyTorch with GPU acceleration.\n"
f"Use `device = {device_hint}` for tensor operations.\n"
)
else: # limited (low VRAM NVIDIA or MPS)
device_hint = f"torch.device('{gpu_type}')"
pkg_hint = (
f"\nAVAILABLE PACKAGES ({pkg_prefix}): Python stdlib, numpy, torch, sklearn, scipy, pandas{pkg_extras}.\n"
f"GPU: {gpu_name} ({gpu_type}) — LIMITED performance.\n"
f"Use `device = {device_hint}` but design LIGHTWEIGHT experiments:\n"
f"- Small models (<1M parameters)\n"
f"- Few epochs (<=20)\n"
f"- Small datasets (<=10K samples)\n"
f"- Avoid large batch sizes\n"
)
else:
pkg_hint = _pm.block("pkg_hint_sandbox")
else:
pkg_hint = ""
# --- Compute budget hint ---
time_budget_sec = config.experiment.time_budget_sec
try:
compute_budget = _pm.block("compute_budget").replace(
"{time_budget_sec}", str(time_budget_sec)
)
except Exception: # noqa: BLE001
compute_budget = (
f"\n## Compute Budget Constraint\n"
f"- Total execution time limit: {time_budget_sec} seconds\n"
f"- Design experiments that complete within this budget\n"
f"- Implement a time guard: stop gracefully at 80% of budget\n"
)
# --- Dataset guidance + setup script + HP reporting (docker/sandbox modes) ---
extra_guidance = ""
_net_policy = getattr(getattr(config, "docker", None), "network_policy", "setup_only")
if config.experiment.mode in ("sandbox", "docker"):
_net_policy = (
config.experiment.docker.network_policy
if config.experiment.mode == "docker"
else "none" # sandbox mode has no network
)
if _net_policy == "none":
# Network disabled: inject strict offline-only guidance
try:
extra_guidance += _pm.block("network_disabled_guidance")
except Exception: # noqa: BLE001
pass
elif _net_policy == "full":
try:
extra_guidance += _pm.block("dataset_guidance")
extra_guidance += _pm.block("network_full_guidance")
except Exception: # noqa: BLE001
pass
else:
# setup_only or pip_only — existing behavior
try:
extra_guidance += _pm.block("dataset_guidance")
except Exception: # noqa: BLE001
pass
if config.experiment.mode == "docker":
try:
extra_guidance += _pm.block("setup_script_guidance")
except Exception: # noqa: BLE001
pass
try:
extra_guidance += _pm.block("hp_reporting")
except Exception: # noqa: BLE001
pass
# I-06: Multi-seed enforcement for all experiments
try:
extra_guidance += _pm.block("multi_seed_enforcement")
except Exception: # noqa: BLE001
pass
# --- BA: Inject BenchmarkAgent plan from Stage 9 ---
_bp_path = None
for _s9_dir in sorted(run_dir.glob("stage-09*"), reverse=True):
_candidate = _s9_dir / "benchmark_plan.json"
if _candidate.exists():
_bp_path = _candidate
break
if _bp_path is not None:
try:
import json as _json_bp
_bp_data = _json_bp.loads(_bp_path.read_text(encoding="utf-8"))
# Reconstruct the prompt block
from researchclaw.agents.benchmark_agent.orchestrator import BenchmarkPlan
_bp = BenchmarkPlan(
selected_benchmarks=_bp_data.get("selected_benchmarks", []),
selected_baselines=_bp_data.get("selected_baselines", []),
data_loader_code=_bp_data.get("data_loader_code", ""),
baseline_code=_bp_data.get("baseline_code", ""),
experiment_notes=_bp_data.get("experiment_notes", ""),
)
_bp_block = _bp.to_prompt_block()
if _bp_block:
extra_guidance += (
"\n\n## BenchmarkAgent Selections (USE THESE)\n"
"The following datasets, baselines, and code snippets were "
"automatically selected and validated by the BenchmarkAgent. "
"You MUST use these selections in your experiment code.\n\n"
+ _bp_block
)
logger.info(
"BA: Injected benchmark plan (%d benchmarks, %d baselines)",
len(_bp.selected_benchmarks), len(_bp.selected_baselines),
)
except Exception as _bp_exc:
logger.debug("BA: Failed to load benchmark plan: %s", _bp_exc)
# --- P2.2+P2.3: LLM training topic detection and guidance ---
_llm_keywords = (
"language model", "llm", "fine-tun", "lora", "qlora", "peft",
"instruction tun", "rlhf", "dpo", "sft", "alignment",
"transformer train", "causal lm", "chat model", "qwen", "llama",
"mistral", "phi-", "gemma", "pretraining", "tokeniz",
)
topic_lower = config.research.topic.lower()
is_llm_topic = any(kw in topic_lower for kw in _llm_keywords)
# --- I-08: RL topic detection and step guidance ---
_rl_keywords = (
"reinforcement learning", "policy gradient", "ppo", "sac", "td3",
"ddpg", "dqn", "a2c", "a3c", "mujoco", "locomotion", "continuous control",
"reward shaping", "exploration", "multi-agent rl", "marl", "curriculum rl",
"imitation learning", "inverse rl", "offline rl", "model-based rl",
"actor-critic", "reinforce", "gym", "gymnasium",
)
is_rl_topic = any(kw in topic_lower for kw in _rl_keywords)
if is_rl_topic:
try:
extra_guidance += _pm.block("rl_step_guidance")
except Exception: # noqa: BLE001
pass
# --- F-01: Framework API doc injection (auto-detected) ---
try:
from researchclaw.data import detect_frameworks, load_framework_docs
_hypothesis_text = _read_prior_artifact(run_dir, "hypotheses.md") or ""
_fw_ids = detect_frameworks(
config.research.topic, _hypothesis_text, exp_plan or ""
)
if _fw_ids:
_fw_docs = load_framework_docs(_fw_ids, max_chars=8000)
if _fw_docs:
extra_guidance += _fw_docs
logger.info("F-01: Injected framework docs for: %s", _fw_ids)
except Exception: # noqa: BLE001
logger.debug("F-01: Framework doc injection skipped", exc_info=True)
if is_llm_topic and config.experiment.mode == "docker":
try:
extra_guidance += _pm.block("llm_training_guidance")
except Exception: # noqa: BLE001
pass
try:
extra_guidance += _pm.block("llm_eval_guidance")
except Exception: # noqa: BLE001
pass
# P2.3: Warn if time budget is too short for LLM training
if time_budget_sec < 3600:
extra_guidance += (
"\n## COMPUTE BUDGET WARNING\n"
f"Current time_budget_sec={time_budget_sec} is likely TOO SHORT "
f"for LLM fine-tuning. Typical LoRA training needs 1-4 hours. "
f"Design a LIGHTWEIGHT experiment:\n"
f"- Use a small dataset (<=5000 samples)\n"
f"- Train for 1-3 epochs only\n"
f"- Use small batch size (1-2) with gradient accumulation\n"
f"- Use 4-bit quantization (QLoRA) to minimize memory\n"
f"- Limit max_seq_length to 512-1024\n"
f"- If possible, use a smaller model (<=7B parameters)\n"
)
# --- Domain-specific guidance injection for non-ML domains ---
try:
from researchclaw.domains.detector import detect_domain as _dd_s10, is_ml_domain as _is_ml_s10
_dp = _dd_s10(topic=config.research.topic)
if not _is_ml_s10(_dp):
from researchclaw.domains.prompt_adapter import get_adapter as _ga
_adapter = _ga(_dp)
_blocks = _adapter.get_code_generation_blocks({})
if _blocks.compute_budget:
compute_budget = _blocks.compute_budget
if _blocks.dataset_guidance:
extra_guidance = _blocks.dataset_guidance + "\n" + extra_guidance
if _blocks.code_generation_hints:
extra_guidance += "\n" + _blocks.code_generation_hints
if _blocks.output_format_guidance:
extra_guidance += "\n" + _blocks.output_format_guidance
logger.info("Injected domain-specific guidance for %s", _dp.domain_id)
except Exception: # noqa: BLE001
logger.debug("Domain guidance injection skipped", exc_info=True)
# BUG-R6-01: Add explicit implementation constraints to prevent LLM
# from substituting unrelated DL models for lightweight algorithms.
extra_guidance += (
"\n\nIMPLEMENTATION CONSTRAINTS (MUST FOLLOW):\n"
"- Implement EXACTLY the algorithm/method described in the topic.\n"
"- Do NOT replace the stated method with a deep-learning proxy "
"(e.g. ResNet, BERT, GPT, Gymnasium+SB3) unless the topic "
"EXPLICITLY requires deep learning.\n"
"- Prefer lightweight CPU-friendly libraries (numpy, scipy, "
"sklearn, pandas) unless deep learning is inherent to the topic.\n"
"- The experiment MUST be self-contained and runnable without GPU.\n"
)
# --- Code generation: Beast Mode → CodeAgent → Legacy single-shot ---
_code_agent_active = False
_beast_mode_used = False
_code_max_tokens = 8192
# ── Beast Mode: OpenCode external agent (optional) ─────────────────
_oc_cfg = config.experiment.opencode
if _oc_cfg.enabled:
from researchclaw.pipeline.opencode_bridge import (
OpenCodeBridge,
OpenCodeResult,
count_historical_failures,
score_complexity,
)
_hist_failures = count_historical_failures(run_dir)
_cplx = score_complexity(
exp_plan=exp_plan,
topic=config.research.topic,
historical_failures=_hist_failures,
threshold=_oc_cfg.complexity_threshold,
)
# Persist complexity analysis
(stage_dir / "complexity_analysis.json").write_text(
json.dumps(
{
"score": _cplx.score,
"signals": _cplx.signals,
"recommendation": _cplx.recommendation,
"reason": _cplx.reason,
"threshold": _oc_cfg.complexity_threshold,
"historical_failures": _hist_failures,
},
indent=2,
),
encoding="utf-8",
)
if _cplx.recommendation == "beast_mode":
_proceed = _oc_cfg.auto
if not _proceed:
# Non-auto mode: check for HITL adapter
if adapters.hitl is not None:
try:
_proceed = adapters.hitl.confirm(
f"Beast Mode: complexity={_cplx.score:.2f} "
f"(threshold={_oc_cfg.complexity_threshold}). "
f"Route to OpenCode?"
)
except Exception: # noqa: BLE001
logger.info(
"Beast mode: HITL adapter unavailable, skipping "
"(set opencode.auto=true for non-interactive runs)"
)
else:
logger.info(
"Beast mode: no HITL adapter, skipping "
"(set opencode.auto=true for non-interactive runs)"
)
if _proceed:
_oc_model = _oc_cfg.model or config.llm.primary_model
_bridge = OpenCodeBridge(
model=_oc_model,
llm_base_url=config.llm.base_url,
api_key_env=config.llm.api_key_env,
llm_provider=config.llm.provider,
timeout_sec=_oc_cfg.timeout_sec,
max_retries=_oc_cfg.max_retries,
workspace_cleanup=_oc_cfg.workspace_cleanup,
)
logger.info(
"Beast mode: ENGAGED (complexity=%.2f, model=%s)",
_cplx.score,
_oc_model,
)
_oc_result: OpenCodeResult = _bridge.generate(
stage_dir=stage_dir,
topic=config.research.topic,
exp_plan=exp_plan,
metric=metric,
pkg_hint=pkg_hint + "\n" + compute_budget,
extra_guidance=extra_guidance,
time_budget_sec=config.experiment.time_budget_sec,
)
# Persist beast mode log
(stage_dir / "beast_mode_log.json").write_text(
json.dumps(
{
"success": _oc_result.success,
"elapsed_sec": _oc_result.elapsed_sec,
"files": list(_oc_result.files.keys()),
"error": _oc_result.error,
"complexity_score": _cplx.score,
"model": _oc_model,
},
indent=2,
),
encoding="utf-8",
)
if _oc_result.success and _oc_result.files:
files = _oc_result.files
_beast_mode_used = True
_code_agent_active = True # skip legacy path
logger.info(
"Beast mode: SUCCESS — %d files in %.1fs",
len(files),
_oc_result.elapsed_sec,
)
else:
logger.warning(
"Beast mode: FAILED (%s) — falling back to CodeAgent",
_oc_result.error or "unknown error",
)
else:
logger.info(
"Beast mode: complexity=%.2f (threshold=%.2f), not triggered",
_cplx.score,
_oc_cfg.complexity_threshold,
)
if not _beast_mode_used and config.experiment.code_agent.enabled and llm is not None:
# ── F-02: Advanced Code Agent path ────────────────────────────────
from researchclaw.pipeline.code_agent import CodeAgent as _CodeAgent
_ca_cfg = config.experiment.code_agent
# Ensure we have a proper config object
if not hasattr(_ca_cfg, "enabled"):
from researchclaw.pipeline.code_agent import (
CodeAgentConfig as _CAConfig,
)
_ca_cfg = _CAConfig()
# Sandbox factory (only for sandbox/docker modes)
_sandbox_factory = None
if config.experiment.mode in ("sandbox", "docker"):
from researchclaw.experiment.factory import (
create_sandbox as _csb,
)
_sandbox_factory = _csb
if any(
config.llm.primary_model.startswith(p)
for p in ("gpt-5", "o3", "o4")
):
_code_max_tokens = 16384
# ── Domain detection + Code Search for non-ML domains ──────────
_domain_profile = None
_code_search_result = None
try:
from researchclaw.domains.detector import detect_domain as _dd
from researchclaw.domains.detector import is_ml_domain as _is_ml
_domain_profile = _dd(topic=config.research.topic)
logger.info(
"CodeAgent: domain=%s (%s)",
_domain_profile.display_name,
_domain_profile.domain_id,
)
# Run code search for non-ML domains (ML has enough built-in knowledge)
if not _is_ml(_domain_profile):
try:
from researchclaw.agents.code_searcher import CodeSearchAgent
_cs_agent = CodeSearchAgent(llm=llm)
_code_search_result = _cs_agent.search(
topic=config.research.topic,
domain=_domain_profile,
)
if _code_search_result and _code_search_result.patterns.has_content:
logger.info(
"Code search: %d patterns, %d repos found",
len(_code_search_result.patterns.api_patterns),
len(_code_search_result.repos_found),
)
except Exception: # noqa: BLE001
logger.debug("Code search unavailable", exc_info=True)
except Exception: # noqa: BLE001
logger.debug("Domain detection unavailable", exc_info=True)
_agent = _CodeAgent(
llm=llm,
prompts=_pm,
config=_ca_cfg,
stage_dir=stage_dir,
sandbox_factory=_sandbox_factory,
experiment_config=config.experiment,
domain_profile=_domain_profile,
code_search_result=_code_search_result,
)
_agent_result = _agent.generate(
topic=config.research.topic,
exp_plan=exp_plan,
metric=metric,
pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance,
max_tokens=_code_max_tokens,
)
files = _agent_result.files
_code_agent_active = True
# Write agent artifacts
(stage_dir / "code_agent_log.json").write_text(
json.dumps(
{
"log": _agent_result.validation_log,
"llm_calls": _agent_result.total_llm_calls,
"sandbox_runs": _agent_result.total_sandbox_runs,
"best_score": _agent_result.best_score,
"tree_nodes_explored": _agent_result.tree_nodes_explored,
"review_rounds": _agent_result.review_rounds,
},
indent=2,
),
encoding="utf-8",
)
if _agent_result.architecture_spec:
(stage_dir / "architecture_spec.yaml").write_text(
_agent_result.architecture_spec, encoding="utf-8",
)
logger.info(
"CodeAgent: %d LLM calls, %d sandbox runs, score=%.2f",
_agent_result.total_llm_calls,
_agent_result.total_sandbox_runs,
_agent_result.best_score,
)
elif not _beast_mode_used and llm is not None:
# ── Legacy single-shot generation ─────────────────────────────────
topic = config.research.topic
_md = config.experiment.metric_direction
_md_hint = (
f"`{_md}` — use direction={'lower' if _md == 'minimize' else 'higher'} "
f"in METRIC_DEF. You MUST NOT use the opposite direction."
)
_overlay = _get_evolution_overlay(run_dir, "code_generation")
sp = _pm.for_stage(
"code_generation",
evolution_overlay=_overlay,
topic=topic,
metric=metric,
pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance,
exp_plan=exp_plan,
metric_direction_hint=_md_hint,
)
# R13-3: Use higher max_tokens for reasoning models (they consume tokens
# for internal chain-of-thought). Retry once with even higher limit on empty.
_code_max_tokens = sp.max_tokens or 8192
if any(config.llm.primary_model.startswith(p) for p in ("gpt-5", "o3", "o4")):
_code_max_tokens = max(_code_max_tokens, 16384)
resp = _chat_with_prompt(
llm,
sp.system,
sp.user,
json_mode=sp.json_mode,
max_tokens=_code_max_tokens,
)
files = _extract_multi_file_blocks(resp.content)
if not files and not resp.content.strip():
# Empty response — retry with higher token limit
logger.warning(
"R13-3: Empty LLM response for code_generation (len=%d, "
"finish_reason=%s, tokens=%d). Retrying with 32768 tokens.",
len(resp.content),
resp.finish_reason,
resp.total_tokens,
)
resp = _chat_with_prompt(
llm,
sp.system,
sp.user,
json_mode=sp.json_mode,
max_tokens=32768,
)
files = _extract_multi_file_blocks(resp.content)
if not files:
logger.warning(
"R13-2: _extract_multi_file_blocks returned empty. "
"LLM response length=%d, first 300 chars: %s",
len(resp.content),
resp.content[:300],
)
# --- Fallback: generic numerical experiment ---
if not files:
files = {
"main.py": (
"import numpy as np\n"
"\n"
"np.random.seed(42)\n"
"\n"
"# Fallback experiment: parameter sweep on a synthetic objective\n"
"# This runs when LLM code generation fails to produce valid code.\n"
"dim = 10\n"
"n_conditions = 3\n"
"results = {}\n"
"\n"
"for cond_idx in range(n_conditions):\n"
" cond_name = f'condition_{cond_idx}'\n"
" scores = []\n"
" for seed in range(3):\n"
" rng = np.random.RandomState(seed + cond_idx * 100)\n"
" x = rng.randn(dim)\n"
" score = float(1.0 / (1.0 + np.sum(x ** 2)))\n"
" scores.append(score)\n"
" mean_score = float(np.mean(scores))\n"
" results[cond_name] = mean_score\n"
f" print(f'condition={{cond_name}} {metric}: {{mean_score:.6f}}')\n"
"\n"
"best = max(results, key=results.get)\n"
f"print(f'{metric}: {{results[best]:.6f}}')\n"
)
}
# --- Validate each file + auto-repair loop ---
all_valid = True
attempt = 0
for fname, code in list(files.items()):
# Skip non-Python files (requirements.txt, setup.py, etc.)
if not fname.endswith(".py"):
continue
validation = validate_code(code)
repair_attempt = 0
while not validation.ok and llm is not None and repair_attempt < max_repair:
repair_attempt += 1
attempt += 1
# Only send errors to the LLM — warnings don't block validation
# and confuse the LLM into over-correcting (e.g. removing runtime imports)
errors_only = type(validation)(
issues=[i for i in validation.issues if i.severity == "error"]
)
issues_text = format_issues_for_llm(errors_only)
validation_log.append(
f"File {fname} attempt {repair_attempt}: {validation.summary()}"
)
logger.info(
"Code validation failed for %s (attempt %d/%d): %s",
fname,
repair_attempt,
max_repair,
validation.summary(),
)
all_files_ctx = "\n\n".join(
f"```filename:{f}\n{c}\n```" for f, c in files.items()
)
rp = _pm.sub_prompt(
"code_repair",
fname=fname,
issues_text=issues_text,
all_files_ctx=all_files_ctx,
)
resp = _chat_with_prompt(llm, rp.system, rp.user)
_repaired = _extract_code_block(resp.content)
if _repaired.strip():
files[fname] = _repaired
else:
logger.warning("Repair attempt returned empty code, keeping original")
validation = validate_code(files[fname])
if not validation.ok:
all_valid = False
# BUG-14: Log remaining issues prominently
logger.warning(
"Code validation FAILED for %s after %d repair attempts: %s",
fname, max_repair, validation.summary(),
)
# Improvement G: RL algorithm-environment compatibility check
for fname, code in list(files.items()):
if not fname.endswith(".py"):
continue
_rl_errors = _check_rl_compatibility(code)
if _rl_errors:
for _rl_err in _rl_errors:
logger.error("Stage 10: %s (in %s)", _rl_err, fname)
validation_log.append(f"RL_COMPAT: {fname}: {_rl_err}")
all_valid = False
# BUG-14: Block on critical validation failures (syntax/import errors)
if not all_valid:
_has_critical = False
for fname, code in files.items():
_v = validate_code(code)
if not _v.ok:
for issue in _v.issues:
if issue.severity == "error" and issue.category in (
"syntax", "import",
):
_has_critical = True
if _has_critical:
logger.error(
"Stage 10: CRITICAL validation issues remain after %d repair "
"attempts. Blocking stage.", max_repair,
)
(stage_dir / "validation_report.md").write_text(
"# Code Validation Report\n\n"
f"**Status**: BLOCKED — critical issues remain after {max_repair} repairs\n\n"
+ "\n".join(f"- {e}" for e in validation_log),
encoding="utf-8",
)
return StageResult(
stage=Stage.CODE_GENERATION,
status=StageStatus.FAILED,
artifacts=("validation_report.md",),
evidence_refs=(),
)
# --- BUG-184: Cross-import validation — warn if a .py file imports a
# local module that doesn't exist in the files dict. This catches the
# case where Beast Mode/CodeAgent produced an intermediate file that
# got lost during repair iterations.
_known_modules = {
f.replace(".py", "") for f in files if f.endswith(".py")
}
_stdlib_and_common = {
"os", "sys", "json", "math", "time", "copy", "re", "random",
"pathlib", "argparse", "logging", "collections", "functools",
"itertools", "abc", "typing", "dataclasses", "enum", "io",
"csv", "pickle", "glob", "shutil", "subprocess", "datetime",
"numpy", "np", "torch", "torchvision", "gymnasium", "gym",
"sklearn", "scipy", "pandas", "matplotlib", "PIL", "tqdm",
"einops", "timm", "transformers", "datasets", "peft",
"stable_baselines3",
}
for fname, code in list(files.items()):
if not fname.endswith(".py"):
continue
for _m in re.findall(
r"^(?:from|import)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
code, re.MULTILINE,
):
if (_m not in _known_modules
and _m not in _stdlib_and_common
and not _m.startswith("_")):
logger.warning(
"BUG-184: %s imports '%s' which is not in generated "
"files — experiment may crash on import",
fname, _m,
)
# --- Write experiment directory ---
exp_dir = stage_dir / "experiment"
exp_dir.mkdir(parents=True, exist_ok=True)
for fname, code in files.items():
(exp_dir / fname).write_text(code, encoding="utf-8")
# --- Write validation report ---
if validation_log or not all_valid:
report_lines = ["# Code Validation Report\n"]
if all_valid:
report_lines.append(f"**Status**: PASSED after {attempt} total repair(s)\n")
else:
report_lines.append(
f"**Status**: FAILED after {attempt} total repair attempt(s)\n"
)
for entry in validation_log:
report_lines.append(f"- {entry}")
(stage_dir / "validation_report.md").write_text(
"\n".join(report_lines), encoding="utf-8"
)
# --- R10-Fix6: Code complexity and quality check ---
from researchclaw.experiment.validator import (
auto_fix_unbound_locals,
check_code_complexity,
deep_validate_files,
)
# --- BUG-3 fix: Programmatic auto-fix for UnboundLocalError patterns ---
_total_ub_fixes = 0
for fname, code in list(files.items()):
if fname.endswith(".py"):
fixed_code, n_fixes = auto_fix_unbound_locals(code)
if n_fixes > 0:
files[fname] = fixed_code
(exp_dir / fname).write_text(fixed_code, encoding="utf-8")
_total_ub_fixes += n_fixes
logger.info(
"Stage 10: auto-fixed %d UnboundLocalError risk(s) in %s",
n_fixes, fname,
)
if _total_ub_fixes:
logger.info(
"Stage 10: auto-fixed %d total UnboundLocalError risks", _total_ub_fixes
)
complexity_warnings: list[str] = []
for fname, code in files.items():
if fname.endswith(".py"):
cw = check_code_complexity(code)
for w in cw:
complexity_warnings.append(f"[{fname}] {w}")
logger.warning("Stage 10 code quality: [%s] %s", fname, w)
# --- P1.1+P1.2: Deep quality analysis (class quality, scoping, API) ---
deep_warnings = deep_validate_files(files)
for w in deep_warnings:
logger.warning("Stage 10 deep quality: %s", w)
complexity_warnings.extend(deep_warnings)
# --- P1.2: If critical deep issues found, attempt one repair cycle ---
critical_deep = [w for w in deep_warnings if any(
kw in w for kw in ("UnboundLocalError", "unregistered", "does not exist",
"empty or trivial subclass", "does NOT override",
"Import-usage mismatch", "NameError",
"was removed", "ptp()",
"copy-paste", "identical method signatures",
"identical AST", "NOT a real ablation",
"shadows stdlib/pip")
)]
if critical_deep and llm is not None:
logger.info(
"Stage 10: %d critical code issues found — triggering repair cycle",
len(critical_deep),
)
repair_issues = "\n".join(f"- {w}" for w in critical_deep)
all_code_ctx = "\n\n".join(
f"```filename:{f}\n{c}\n```" for f, c in files.items()
)
repair_prompt = (
f"CRITICAL CODE QUALITY ISSUES FOUND:\n{repair_issues}\n\n"
f"Fix ALL these issues in the code below. Return the complete "
f"corrected files using ```filename:xxx.py format.\n\n"
f"RULES:\n"
f"- nn.Linear/nn.Conv must be created in __init__(), not forward()\n"
f"- Variables used after if/else must be defined before the branch\n"
f"- Use scipy.special.erf, not np.erf\n"
f"- Ablation/variant classes must have genuinely different logic\n"
f"- Every class must have a real implementation, not just `pass`\n"
f"- Ablation classes MUST override the parent method that implements "
f"the component being ablated (e.g., if ablating attention, override "
f"the attention method with a simpler alternative like mean pooling)\n"
f"- IMPORT CONSISTENCY: if you write `from X import Y`, call `Y()` "
f"directly — NOT `X.Y()`. Mixing styles causes NameError.\n"
f"- NumPy 2.0: ndarray.ptp() was removed — use arr.max()-arr.min()\n"
f"- NumPy 2.0: np.bool/np.int/np.float removed — use builtins\n"
f"- Pretrained models (EfficientNet, ResNet, ViT) expect 224×224 input "
f"— add `transforms.Resize(224)` when using CIFAR (32×32) or similar\n"
f"- Copy-paste ablation: if two classes have identical bodies, REWRITE "
f"the ablation to genuinely remove/reduce a component (e.g., zero out "
f"attention weights, halve hidden dimensions, remove a loss term)\n"
f"- KD: teacher must be frozen, add projection layers if teacher_dim != "
f"student_dim, use temperature T=4 for soft targets\n"
f"- FILENAME COLLISIONS: If a file like config.py shadows a pip/stdlib "
f"package, rename it (e.g., config.py → experiment_config.py) and update "
f"ALL imports referencing it\n\n"
f"Current code:\n{all_code_ctx}\n"
)
try:
repair_resp = _chat_with_prompt(
llm,
_pm.system("code_generation"),
repair_prompt,
max_tokens=_code_max_tokens,
)
repaired = _extract_multi_file_blocks(repair_resp.content)
if repaired and "main.py" in repaired:
files = repaired
for fname, code in files.items():
(exp_dir / fname).write_text(code, encoding="utf-8")
# Re-check after repair
deep_warnings_after = deep_validate_files(files)
fixed = len(critical_deep) - len([
w for w in deep_warnings_after
if any(kw in w for kw in (
"UnboundLocalError", "unregistered", "does not exist",
"empty or trivial subclass", "does NOT override",
"Import-usage mismatch", "NameError",
"was removed", "ptp()",
"copy-paste", "identical method signatures",
"identical AST", "NOT a real ablation",
"shadows stdlib/pip",
))
])
logger.info(
"Stage 10: Deep repair fixed %d/%d critical issues",
fixed, len(critical_deep),
)
complexity_warnings.append(
f"[REPAIR] Deep repair fixed {fixed}/{len(critical_deep)} "
f"critical issues"
)
except Exception as exc:
logger.debug("Deep repair failed: %s", exc)
if complexity_warnings:
health: dict[str, Any] = {}
health["code_complexity_warnings"] = complexity_warnings
(stage_dir / "code_complexity.json").write_text(
json.dumps(health, indent=2), encoding="utf-8"
)
# --- P1.4: LLM Code Review (Stage 10.5) ---
# Skip when CodeAgent is active — Phase 4 review already covers this.
if llm is not None and not _code_agent_active:
all_code_review = "\n\n".join(
f"# --- {fname} ---\n{code}" for fname, code in files.items()
)
if len(all_code_review) > 12000:
all_code_review = all_code_review[:12000] + "\n... [truncated]"
review_prompt = (
f"You are a senior researcher reviewing experiment code for a "
f"research submission.\n\n"
f"TOPIC: {config.research.topic}\n"
f"EXPERIMENT PLAN:\n{exp_plan[:3000]}\n\n"
f"CODE:\n```python\n{all_code_review}\n```\n\n"
f"Review the code and return JSON with this EXACT structure:\n"
f'{{"score": <1-10>, "issues": ['
f'{{"severity": "critical|major|minor", '
f'"description": "...", "fix": "..."}}], '
f'"verdict": "pass|needs_fix"}}\n\n'
f"Check specifically:\n"
f"1. Does each algorithm/method have a DISTINCT implementation? "
f"(Not just renamed copies)\n"
f"2. Are ablation conditions genuinely different from the main method?\n"
f"3. Are loss functions / training loops mathematically correct?\n"
f"4. Will the code actually run without errors? Check variable scoping, "
f"API usage, tensor shape compatibility.\n"
f"5. Is the code complex enough for a research paper? (Not trivial)\n"
f"6. Are experimental conditions fairly compared (same seeds, data)?\n"
f"7. If using pretrained models (EfficientNet, ResNet, ViT), are input "
f"images resized to the model's expected size (e.g., 224x224)? CIFAR "
f"images are 32x32 and MUST be resized for pretrained models.\n"
f"8. Are imports consistent? `from X import Y` must use `Y()`, not `X.Y()`.\n"
)
try:
review_resp = llm.chat(
[{"role": "user", "content": review_prompt}],
system="You are a meticulous ML code reviewer. Be strict.",
max_tokens=2048,
)
# Extract JSON from LLM response (may be wrapped in markdown fences)
_review_text = review_resp.content if hasattr(review_resp, "content") else str(review_resp)
# Strip markdown JSON fences if present
_review_text = _review_text.strip()
if _review_text.startswith("```"):
_lines = _review_text.splitlines()
_start = 1 if _lines[0].strip().startswith("```") else 0
_end = len(_lines) - 1 if _lines[-1].strip() == "```" else len(_lines)
_review_text = "\n".join(_lines[_start:_end])
review_data = _safe_json_loads(_review_text, {})
if isinstance(review_data, dict):
review_score = review_data.get("score", 0)
review_verdict = review_data.get("verdict", "unknown")
review_issues = review_data.get("issues", [])
# Write review report
review_report = {
"score": review_score,
"verdict": review_verdict,
"issues": review_issues,
"timestamp": _utcnow_iso(),
}
(stage_dir / "code_review.json").write_text(
json.dumps(review_report, indent=2), encoding="utf-8"
)
# If critical issues found and score low, attempt fix
critical_issues = [
i for i in review_issues
if isinstance(i, dict)
and i.get("severity") == "critical"
]
if critical_issues and review_score <= 4:
logger.warning(
"Stage 10 code review: score=%d, %d critical issues — "
"attempting fix",
review_score, len(critical_issues),
)
fix_descriptions = "\n".join(
f"- [{i.get('severity', '?')}] {i.get('description', '?')}: "
f"{i.get('fix', 'no fix suggested')}"
for i in critical_issues
)
fix_prompt = (
f"Code review found {len(critical_issues)} CRITICAL issues "
f"(score: {review_score}/10):\n{fix_descriptions}\n\n"
f"Fix ALL critical issues. Return complete corrected files "
f"using ```filename:xxx.py format.\n\n"
f"Current code:\n"
+ "\n\n".join(
f"```filename:{f}\n{c}\n```" for f, c in files.items()
)
)
try:
fix_resp = _chat_with_prompt(
llm,
_pm.system("code_generation"),
fix_prompt,
max_tokens=_code_max_tokens,
)
fixed_files = _extract_multi_file_blocks(fix_resp.content)
if fixed_files and "main.py" in fixed_files:
files = fixed_files
for fname, code in files.items():
(exp_dir / fname).write_text(code, encoding="utf-8")
logger.info(
"Stage 10: Code fixed after review "
"(was %d/10, %d critical issues)",
review_score, len(critical_issues),
)
except Exception as exc:
logger.debug("Review-fix failed: %s", exc)
except Exception as exc:
logger.debug("Code review failed: %s", exc)
# --- FIX-3: Topic-experiment alignment check ---
# BUG-171: Previous 8000-char truncation caused false-positive misalignment
# for multi-file experiments (30-90K chars). LLM saw "[truncated]" and
# concluded code was incomplete. Fix: build a structured summary that
# includes file inventory + full main.py + per-file function/class headers.
alignment_ok = True
alignment_note = ""
if llm is not None:
# Build structured code summary for alignment check
_file_inventory = []
for _fn, _cd in files.items():
_lines = _cd.count("\n") + 1
_file_inventory.append(f" {_fn}: {_lines} lines, {len(_cd)} chars")
_inventory_block = "FILES GENERATED:\n" + "\n".join(_file_inventory)
# BUG-179: Beast Mode may use a different entry point (e.g.
# run_experiment.py). Detect the actual entry point by scanning
# for ``if __name__ == "__main__"`` in all files, preferring main.py.
_entry_file = "main.py"
if "main.py" not in files or not files.get("main.py", "").strip():
for _fn, _cd in files.items():
if 'if __name__' in _cd and '__main__' in _cd:
_entry_file = _fn
break
elif files.get("main.py", ""):
# main.py exists but may be a stub — if another file has the
# real orchestration (more lines + __main__ guard), prefer it
_main_lines = files["main.py"].count("\n")
for _fn, _cd in files.items():
if _fn == "main.py":
continue
if ('if __name__' in _cd and '__main__' in _cd
and _cd.count("\n") > _main_lines * 1.5):
_entry_file = _fn
break
_main_code = files.get(_entry_file, files.get("main.py", ""))
_main_block = f"# --- {_entry_file} (FULL — entry point) ---\n{_main_code}"
# Cap main.py at 12000 chars to stay within token budget
if len(_main_block) > 12000:
_main_block = _main_block[:12000] + "\n... [main.py truncated at 12000 chars]"
# For other files, include imports + function/class signatures
_other_summaries = []
for _fn, _cd in files.items():
if _fn == _entry_file:
continue
_sig_lines = []
for _line in _cd.split("\n"):
_stripped = _line.strip()
if (_stripped.startswith("def ") or _stripped.startswith("class ")
or _stripped.startswith("async def ")
# BUG-209: Include import lines — they reveal which
# techniques/libraries are used (e.g. CosineAnnealingLR)
or _stripped.startswith("import ")
or _stripped.startswith("from ")):
_sig_lines.append(_line)
if _sig_lines:
_other_summaries.append(
f"# --- {_fn} (imports + signatures) ---\n"
+ "\n".join(_sig_lines)
)
else:
# Small file — include first 800 chars
_preview = _cd[:800]
if len(_cd) > 800:
_preview += f"\n... [{len(_cd) - 800} more chars]"
_other_summaries.append(f"# --- {_fn} (preview) ---\n{_preview}")
_other_block = "\n\n".join(_other_summaries)
# Cap other summaries
if len(_other_block) > 6000:
_other_block = _other_block[:6000] + "\n... [other files truncated]"
all_code_for_check = (
f"{_inventory_block}\n\n{_main_block}\n\n{_other_block}"
)
align_prompt = (
f"Research topic: {config.research.topic}\n\n"
f"Experiment code:\n```python\n{all_code_for_check}\n```\n\n"
"TASK: Evaluate whether this experiment code actually tests the "
"stated research topic. Answer with JSON:\n"
'{"aligned": true/false, "reason": "...", "suggestions": "..."}\n\n'
"IMPORTANT: The code spans MULTIPLE files. The file inventory above "
"shows ALL generated files. Only main.py is shown in full; other "
"files show function/class signatures. Do NOT mark as misaligned "
"just because helper files are summarized — they contain full "
"implementations.\n\n"
"Check specifically:\n"
"- Does main.py orchestrate an experiment matching the topic?\n"
"- Do the helper file signatures indicate relevant models/methods?\n"
"- If the topic mentions a specific technique, is there evidence of "
"its implementation (function names, class names, imports)?\n"
"- Are the experimental conditions meaningfully different from each other?\n"
)
try:
align_resp = llm.chat(
[{"role": "user", "content": align_prompt}],
system="You are a scientific code reviewer checking topic-experiment alignment.",
max_tokens=1024,
)
align_data = _safe_json_loads(align_resp.content, {})
if isinstance(align_data, dict) and not align_data.get("aligned", True):
alignment_ok = False
alignment_note = align_data.get("reason", "Misaligned")
suggestions = align_data.get("suggestions", "")
logger.warning(
"Stage 10: Topic-experiment MISALIGNMENT detected: %s",
alignment_note,
)
# BUG-R6-01: Allow up to 2 regeneration attempts with re-check.
_max_regen = 2
for _regen_attempt in range(1, _max_regen + 1):
logger.info(
"Stage 10: Alignment regen attempt %d/%d",
_regen_attempt, _max_regen,
)
regen_prompt = (
f"The experiment code you previously generated does NOT align "
f"with the research topic.\n\n"
f"TOPIC: {config.research.topic}\n"
f"MISALIGNMENT: {alignment_note}\n"
f"SUGGESTIONS: {suggestions}\n\n"
f"REGENERATE the experiment code to DIRECTLY test the stated "
f"topic. The code MUST implement the core technique described "
f"in the topic, not a generic proxy.\n\n"
f"CRITICAL CONSTRAINTS:\n"
f"- You MUST implement the EXACT algorithm/method from the topic.\n"
f"- Do NOT substitute a deep-learning proxy (ResNet, BERT, etc.) "
f"when the topic describes a tabular, bandit, or game-theoretic method.\n"
f"- Use ONLY lightweight CPU-friendly libraries (numpy, scipy, "
f"sklearn) unless the topic EXPLICITLY requires deep learning.\n"
f"- The experiment must be self-contained and runnable without GPU.\n\n"
f"{pkg_hint}\n{compute_budget}\n"
f"PLAN:\n{exp_plan}\n\n"
f"Return multiple files using ```filename:xxx.py format."
)
regen_resp = _chat_with_prompt(
llm,
system=_pm.system("code_generation"),
user=regen_prompt,
max_tokens=_code_max_tokens,
)
regen_files = _extract_multi_file_blocks(regen_resp.content)
if not regen_files or "main.py" not in regen_files:
logger.warning(
"Stage 10: Regen attempt %d produced no main.py",
_regen_attempt,
)
continue
files = regen_files
for fname, code in files.items():
(exp_dir / fname).write_text(code, encoding="utf-8")
# Re-check alignment on regenerated code (BUG-171 fix)
_rc_inv = []
for _fn, _cd in files.items():
_rc_inv.append(f" {_fn}: {_cd.count(chr(10))+1} lines")
_rc_main = files.get("main.py", "")
if len(_rc_main) > 12000:
_rc_main = _rc_main[:12000] + "\n... [truncated]"
_rc_sigs = []
for _fn, _cd in files.items():
if _fn == "main.py":
continue
# BUG-209: Include imports alongside signatures
_slines = [l for l in _cd.split("\n")
if l.strip().startswith((
"def ", "class ", "async def ",
"import ", "from ",
))]
if _slines:
_rc_sigs.append(f"# {_fn} imports+signatures:\n" + "\n".join(_slines))
recheck_code = (
"FILES:\n" + "\n".join(_rc_inv) + "\n\n"
f"# main.py (FULL):\n{_rc_main}\n\n"
+ "\n".join(_rc_sigs)
)
recheck_resp = llm.chat(
[{"role": "user", "content": (
f"Research topic: {config.research.topic}\n\n"
f"Experiment code:\n```python\n{recheck_code}\n```\n\n"
"TASK: Evaluate whether this experiment code actually tests "
"the stated research topic. Only main.py is shown in full; "
"other files show signatures only. Answer with JSON:\n"
'{"aligned": true/false, "reason": "...", "suggestions": "..."}\n'
)}],
system="You are a scientific code reviewer checking topic-experiment alignment.",
max_tokens=1024,
)
recheck_data = _safe_json_loads(recheck_resp.content, {})
if isinstance(recheck_data, dict) and recheck_data.get("aligned", False):
alignment_ok = True
alignment_note = f"Regenerated after alignment check (attempt {_regen_attempt})"
logger.info(
"Stage 10: Code aligned after regen attempt %d",
_regen_attempt,
)
break
else:
alignment_note = recheck_data.get("reason", alignment_note)
suggestions = recheck_data.get("suggestions", suggestions)
logger.warning(
"Stage 10: Regen attempt %d still misaligned: %s",
_regen_attempt, alignment_note,
)
except Exception as exc:
logger.debug("Alignment check failed: %s", exc)
# --- FIX-7: Ablation distinctness check ---
main_code = files.get("main.py", "")
if llm is not None and main_code and "condition" in main_code.lower():
try:
ablation_prompt = (
f"Examine this experiment code:\n```python\n{main_code[:6000]}\n```\n\n"
"Check if any experimental conditions (methods/ablations) have "
"IDENTICAL configurations (same hyperparameters, same code paths). "
"Answer JSON: "
'{"has_duplicates": true/false, "details": "which conditions are identical"}'
)
abl_resp = llm.chat(
[{"role": "user", "content": ablation_prompt}],
system="You are a code reviewer checking experimental conditions.",
max_tokens=512,
)
abl_data = _safe_json_loads(abl_resp.content, {})
if isinstance(abl_data, dict) and abl_data.get("has_duplicates"):
logger.warning(
"Stage 10: Duplicate ablation conditions detected: %s",
abl_data.get("details", ""),
)
(stage_dir / "ablation_warning.json").write_text(
json.dumps(abl_data, indent=2), encoding="utf-8"
)
# --- Attempt ablation repair ---
all_code_ctx = "\n\n".join(
f"```filename:{f}\n{c}\n```" for f, c in files.items()
)
dup_details = abl_data.get("details", "unknown")
abl_repair_prompt = (
f"ABLATION REPAIR REQUIRED — duplicate conditions detected:\n"
f"{dup_details}\n\n"
f"Rewrite the ablation/variant conditions so each one is "
f"GENUINELY DIFFERENT. Concrete strategies:\n"
f"- 'no_': REMOVE the component entirely "
f"(e.g., replace attention with mean pooling, remove a loss term)\n"
f"- 'reduced_capacity': HALVE hidden dimensions or layers\n"
f"- Different conditions MUST produce different outputs on the "
f"same input. Add a startup assertion that runs one forward pass "
f"per condition on identical input and prints:\n"
f" ABLATION_CHECK: vs outputs_differ=True\n\n"
f"Return ALL files using ```filename:xxx.py format.\n\n"
f"Current code:\n{all_code_ctx}\n"
)
try:
abl_repair_resp = _chat_with_prompt(
llm,
_pm.system("code_generation"),
abl_repair_prompt,
max_tokens=_code_max_tokens,
)
repaired_files = _extract_multi_file_blocks(
abl_repair_resp.content
)
if repaired_files and "main.py" in repaired_files:
files = repaired_files
for fname, code in files.items():
(exp_dir / fname).write_text(code, encoding="utf-8")
logger.info(
"Stage 10: Ablation repair applied — "
"rewrote duplicate conditions"
)
except Exception as exc:
logger.debug("Ablation repair failed: %s", exc)
except Exception as exc:
logger.debug("Ablation validation skipped: %s", exc)
# --- Write spec ---
file_list = ", ".join(f"`{f}`" for f in sorted(files.keys()))
main_validation = validate_code(files.get("main.py", ""))
_align_status = "ALIGNED" if alignment_ok else f"MISALIGNED: {alignment_note}"
spec = f"""# Experiment Specification
## Topic
{config.research.topic}
## Project Structure
Multi-file experiment project with {len(files)} file(s): {file_list}
## Entry Point
`main.py` \u2014 executed directly via sandbox
## Outputs
- `main.py` emits metric lines in `name: value` format
- Primary metric key: `{metric}`
## Topic-Experiment Alignment
{_align_status}
## Constraints
- Time budget per run: {config.experiment.time_budget_sec}s
- Max iterations: {config.experiment.max_iterations}
- Self-contained execution (no external data, no network)
- Validated: {main_validation.summary()}
## Generated
{_utcnow_iso()}
"""
(stage_dir / "experiment_spec.md").write_text(spec, encoding="utf-8")
artifacts = ["experiment/", "experiment_spec.md"]
if (stage_dir / "validation_report.md").exists():
artifacts.append("validation_report.md")
# BUG-R6-01: Fail stage if alignment check detected persistent mismatch
# after all regen attempts, instead of silently proceeding.
if not alignment_ok:
logger.error(
"Stage 10: Persistent topic-experiment misalignment after all "
"regen attempts. Failing stage. Reason: %s",
alignment_note,
)
return StageResult(
stage=Stage.CODE_GENERATION,
status=StageStatus.FAILED,
artifacts=tuple(artifacts),
evidence_refs=tuple(f"stage-10/{a}" for a in artifacts),
error=f"Topic-experiment misalignment: {alignment_note}",
)
return StageResult(
stage=Stage.CODE_GENERATION,
status=StageStatus.DONE,
artifacts=tuple(artifacts),
evidence_refs=tuple(f"stage-10/{a}" for a in artifacts),
)
================================================
FILE: researchclaw/pipeline/stage_impls/_execution.py
================================================
"""Stages 11-13: Resource planning, experiment execution, and iterative refinement."""
from __future__ import annotations
import json
import logging
import math
import re
import time as _time
from pathlib import Path
from typing import Any
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.experiment.validator import (
CodeValidation,
format_issues_for_llm,
validate_code,
)
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
StageResult,
_chat_with_prompt,
_detect_runtime_issues,
_ensure_sandbox_deps,
_extract_code_block,
_extract_multi_file_blocks,
_get_evolution_overlay,
_load_hardware_profile,
_parse_metrics_from_stdout,
_read_prior_artifact,
_safe_filename,
_safe_json_loads,
_utcnow_iso,
_write_stage_meta,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
def _execute_resource_planning(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 11: Produce a task/GPU schedule for the experiment plan.

    When an LLM is available, asks it for a schedule derived from the
    Stage-9 ``exp_plan.yaml`` artifact. Falls back to a minimal
    deterministic two-task (baseline -> proposed) schedule when no LLM is
    configured or the LLM response cannot be parsed into a usable dict.

    Writes ``schedule.json`` into ``stage_dir`` and always returns a DONE
    StageResult pointing at that artifact.
    """
    exp_plan = _read_prior_artifact(run_dir, "exp_plan.yaml") or ""
    schedule: dict[str, Any] | None = None
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "resource_planning")
        sp = _pm.for_stage(
            "resource_planning", evolution_overlay=_overlay, exp_plan=exp_plan
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        parsed = _safe_json_loads(resp.content, {})
        # Bug fix: _safe_json_loads returns the {} default on parse failure.
        # Accepting any dict (including that empty default) silently skipped
        # the fallback schedule below and wrote an empty schedule.json.
        # Require a *non-empty* dict before trusting the LLM output.
        if isinstance(parsed, dict) and parsed:
            schedule = parsed
    if schedule is None:
        # No LLM, or unusable LLM output: minimal deterministic schedule.
        schedule = {
            "tasks": [
                {
                    "id": "baseline",
                    "name": "Run baseline",
                    "depends_on": [],
                    "gpu_count": 1,
                    "estimated_minutes": 20,
                    "priority": "high",
                },
                {
                    "id": "proposed",
                    "name": "Run proposed method",
                    "depends_on": ["baseline"],
                    "gpu_count": 1,
                    "estimated_minutes": 30,
                    "priority": "high",
                },
            ],
            "total_gpu_budget": 1,
            "generated": _utcnow_iso(),
        }
    # LLM-produced schedules may lack a timestamp; add one if missing.
    schedule.setdefault("generated", _utcnow_iso())
    (stage_dir / "schedule.json").write_text(
        json.dumps(schedule, indent=2), encoding="utf-8"
    )
    return StageResult(
        stage=Stage.RESOURCE_PLANNING,
        status=StageStatus.DONE,
        artifacts=("schedule.json",),
        evidence_refs=("stage-11/schedule.json",),
    )
def _execute_experiment_run(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 12: Execute the generated experiment and record per-run results.

    Dispatches on ``config.experiment.mode``:

    - ``sandbox`` / ``docker``: runs the multi-file experiment project (or a
      single ``experiment.py``) in a sandbox; parses metrics from stdout when
      the sandbox reports none; classifies the run as completed / partial /
      failed; writes warnings for time-budget exhaustion (R11-6) and for
      conditions with fewer than 3 seeds (FIX-8).
    - ``simulated``: fabricates one result JSON per scheduled task.
    - anything else: delegates to :class:`ExperimentRunner`'s improve-loop.

    Run payloads are written under ``stage_dir / "runs"``. Always returns a
    DONE StageResult — failure details live inside the run JSON files.
    """
    from researchclaw.experiment.factory import create_sandbox
    from researchclaw.experiment.runner import ExperimentRunner

    schedule_text = _read_prior_artifact(run_dir, "schedule.json") or "{}"
    # Try multi-file experiment directory first, fall back to single file
    exp_dir_path = _read_prior_artifact(run_dir, "experiment/")
    code_text = ""
    if exp_dir_path and Path(exp_dir_path).is_dir():
        main_path = Path(exp_dir_path) / "main.py"
        if main_path.exists():
            code_text = main_path.read_text(encoding="utf-8")
    if not code_text:
        code_text = _read_prior_artifact(run_dir, "experiment.py") or ""
    runs_dir = stage_dir / "runs"
    runs_dir.mkdir(parents=True, exist_ok=True)
    mode = config.experiment.mode
    if mode in ("sandbox", "docker"):
        # P7: Auto-install missing dependencies before subprocess sandbox
        if mode == "sandbox":
            _all_code = code_text
            if exp_dir_path and Path(exp_dir_path).is_dir():
                for _pyf in Path(exp_dir_path).glob("*.py"):
                    try:
                        _all_code += "\n" + _pyf.read_text(encoding="utf-8")
                    except (OSError, UnicodeDecodeError):
                        # Best-effort scan; skip unreadable files.
                        pass
            _ensure_sandbox_deps(_all_code, config.experiment.sandbox.python_path)
        sandbox = create_sandbox(config.experiment, runs_dir / "sandbox")
        # Use run_project for multi-file, run for single-file
        if exp_dir_path and Path(exp_dir_path).is_dir():
            result = sandbox.run_project(
                Path(exp_dir_path), timeout_sec=config.experiment.time_budget_sec
            )
        else:
            result = sandbox.run(
                code_text, timeout_sec=config.experiment.time_budget_sec
            )
        # Try to read structured results.json from sandbox working dir
        structured_results: dict[str, Any] | None = None
        sandbox_project = runs_dir / "sandbox" / "_project"
        results_json_path = sandbox_project / "results.json"
        if results_json_path.exists():
            try:
                structured_results = json.loads(
                    results_json_path.read_text(encoding="utf-8")
                )
                # Copy results.json to runs dir for easy access
                (runs_dir / "results.json").write_text(
                    results_json_path.read_text(encoding="utf-8"),
                    encoding="utf-8",
                )
            except (json.JSONDecodeError, OSError):
                structured_results = None
        # If sandbox metrics are empty, try to parse from stdout
        effective_metrics = result.metrics
        if not effective_metrics and result.stdout:
            effective_metrics = _parse_metrics_from_stdout(result.stdout)
        # Determine run status: completed / partial (timed out with data) / failed
        # R6-2: Detect stdout failure signals even when exit code is 0
        _stdout_has_failure = bool(
            result.stdout
            and not effective_metrics
            and any(
                sig in result.stdout
                for sig in ("FAIL:", "NaN/divergence", "Traceback (most recent")
            )
        )
        if result.returncode == 0 and not result.timed_out and not _stdout_has_failure:
            run_status = "completed"
        elif result.timed_out and effective_metrics:
            run_status = "partial"
            logger.warning(
                "Experiment timed out but captured %d partial metrics",
                len(effective_metrics),
            )
        else:
            run_status = "failed"
            if _stdout_has_failure:
                logger.warning(
                    "Experiment exited cleanly but stdout contains failure signals"
                )
        # P1: Warn if experiment completed suspiciously fast (trivially easy benchmark)
        if run_status == "completed" and result.elapsed_sec and result.elapsed_sec < 5.0:
            logger.warning(
                "Stage 12: Experiment completed in %.2fs — benchmark may be trivially easy. "
                "Consider increasing task difficulty.",
                result.elapsed_sec,
            )
        run_payload: dict[str, Any] = {
            "run_id": "run-1",
            "task_id": "sandbox-main",
            "status": run_status,
            "metrics": effective_metrics,
            "elapsed_sec": result.elapsed_sec,
            "stdout": result.stdout,
            "stderr": result.stderr,
            "timed_out": result.timed_out,
            "completed_at": _utcnow_iso(),
        }
        if structured_results is not None:
            run_payload["structured_results"] = structured_results
        # Auto-generate results.json from parsed metrics if sandbox didn't produce one
        if structured_results is None and effective_metrics:
            auto_results = {"source": "stdout_parsed", "metrics": effective_metrics}
            (runs_dir / "results.json").write_text(
                json.dumps(auto_results, indent=2), encoding="utf-8"
            )
            logger.info("Stage 12: Auto-generated results.json from stdout metrics (%d keys)", len(effective_metrics))
        (runs_dir / "run-1.json").write_text(
            json.dumps(run_payload, indent=2), encoding="utf-8"
        )
        # R11-6: Time budget adequacy check
        if result.timed_out or (result.elapsed_sec and result.elapsed_sec > config.experiment.time_budget_sec * 0.9):
            # Parse stdout to estimate how many conditions/seeds completed
            _stdout = result.stdout or ""
            _completed_conditions = set()
            _completed_seeds = 0
            for _line in _stdout.splitlines():
                if "condition=" in _line and "seed=" in _line:
                    _completed_seeds += 1
                    _cond_match = re.match(r".*condition=(\S+)", _line)
                    if _cond_match:
                        _completed_conditions.add(_cond_match.group(1))
            # Bug fix: result.elapsed_sec may be None when the run timed out
            # before timing data was captured; formatting None with ":.0f"
            # raised TypeError and crashed the stage. Format a None-safe
            # value instead (the stored "elapsed_sec" keeps the raw value).
            _elapsed_for_msg = result.elapsed_sec or 0.0
            _time_budget_warning = {
                "timed_out": result.timed_out,
                "elapsed_sec": result.elapsed_sec,
                "budget_sec": config.experiment.time_budget_sec,
                "conditions_completed": sorted(_completed_conditions),
                "total_seed_runs": _completed_seeds,
                "warning": (
                    f"Experiment used {_elapsed_for_msg:.0f}s of "
                    f"{config.experiment.time_budget_sec}s budget. "
                    f"Only {len(_completed_conditions)} conditions completed "
                    f"({_completed_seeds} seed-runs). Consider increasing "
                    f"time_budget_sec for more complete results."
                ),
            }
            logger.warning(
                "Stage 12: %s", _time_budget_warning["warning"]
            )
            (stage_dir / "time_budget_warning.json").write_text(
                json.dumps(_time_budget_warning, indent=2), encoding="utf-8"
            )
        # FIX-8: Validate seed count from structured results
        if structured_results and isinstance(structured_results, dict):
            _sr_conditions = structured_results.get("conditions", structured_results.get("per_condition", {}))
            if isinstance(_sr_conditions, dict):
                for _cname, _cdata in _sr_conditions.items():
                    if isinstance(_cdata, dict):
                        _seeds_run = _cdata.get("seeds_run", _cdata.get("n_seeds", 0))
                        if isinstance(_seeds_run, (int, float)) and 0 < _seeds_run < 3:
                            logger.warning(
                                "Stage 12: Condition '%s' ran only %d seed(s) — "
                                "minimum 3 required for statistical validity",
                                _cname, int(_seeds_run),
                            )
    elif mode == "simulated":
        # Fabricate deterministic per-task results from the Stage-11 schedule.
        schedule = _safe_json_loads(schedule_text, {})
        tasks = schedule.get("tasks", []) if isinstance(schedule, dict) else []
        if not isinstance(tasks, list):
            tasks = []
        for idx, task in enumerate(tasks or [{"id": "task-1", "name": "simulated"}]):
            task_id = (
                str(task.get("id", f"task-{idx + 1}"))
                if isinstance(task, dict)
                else f"task-{idx + 1}"
            )
            payload = {
                "run_id": f"run-{idx + 1}",
                "task_id": task_id,
                "status": "simulated",
                "key_metrics": {
                    config.experiment.metric_key: round(0.3 + idx * 0.03, 4),
                    "secondary_metric": round(0.6 - idx * 0.04, 4),
                },
                "notes": "Simulated run result",
                "completed_at": _utcnow_iso(),
            }
            run_id = str(payload["run_id"])
            (runs_dir / f"{_safe_filename(run_id)}.json").write_text(
                json.dumps(payload, indent=2), encoding="utf-8"
            )
    else:
        # Any other mode: delegate to the iterative ExperimentRunner loop.
        runner = ExperimentRunner(config.experiment, runs_dir / "workspace")
        history = runner.run_loop(code_text, run_id=f"exp-{run_dir.name}", llm=llm)
        runner.save_history(stage_dir / "experiment_history.json")
        for item in history.results:
            payload = {
                "run_id": f"run-{item.iteration}",
                "task_id": item.run_id,
                "status": "completed" if item.error is None else "failed",
                "metrics": item.metrics,
                "primary_metric": item.primary_metric,
                "improved": item.improved,
                "kept": item.kept,
                "elapsed_sec": item.elapsed_sec,
                "error": item.error,
                "completed_at": _utcnow_iso(),
            }
            run_id = str(payload["run_id"])
            (runs_dir / f"{_safe_filename(run_id)}.json").write_text(
                json.dumps(payload, indent=2), encoding="utf-8"
            )
    return StageResult(
        stage=Stage.EXPERIMENT_RUN,
        status=StageStatus.DONE,
        artifacts=("runs/",),
        evidence_refs=("stage-12/runs/",),
    )
def _execute_iterative_refine(
stage_dir: Path,
run_dir: Path,
config: RCConfig,
adapters: AdapterBundle,
*,
llm: LLMClient | None = None,
prompts: PromptManager | None = None,
) -> StageResult:
from researchclaw.experiment.factory import create_sandbox
from researchclaw.experiment.validator import format_issues_for_llm, validate_code
def _to_float(value: Any) -> float | None:
try:
if value is None:
return None
f = float(value)
# BUG-EX-01: NaN/Inf block all future improvement detection
if math.isnan(f) or math.isinf(f):
return None
return f
except (TypeError, ValueError):
return None
# R10-Fix3: Skip iterative refinement in simulated mode (no real execution)
if config.experiment.mode == "simulated":
logger.info(
"Stage 13: Skipping iterative refinement in simulated mode "
"(no real code execution available)"
)
import shutil
final_dir = stage_dir / "experiment_final"
# Copy latest experiment code as final (directory or single file)
copied = False
for stage_num in (12, 10):
src_dir = run_dir / f"stage-{stage_num:02d}" / "experiment"
if src_dir.is_dir():
if final_dir.exists():
shutil.rmtree(final_dir)
shutil.copytree(src_dir, final_dir)
copied = True
break
# Also check for single experiment.py
src_file = run_dir / f"stage-{stage_num:02d}" / "experiment.py"
if src_file.is_file():
(stage_dir / "experiment_final.py").write_text(
src_file.read_text(encoding="utf-8"), encoding="utf-8"
)
copied = True
break
log: dict[str, Any] = {
"generated": _utcnow_iso(),
"mode": "simulated",
"skipped": True,
"skip_reason": "Iterative refinement not meaningful in simulated mode",
"metric_key": config.experiment.metric_key,
}
(stage_dir / "refinement_log.json").write_text(
json.dumps(log, indent=2), encoding="utf-8"
)
return StageResult(
stage=Stage.ITERATIVE_REFINE,
status=StageStatus.DONE,
artifacts=("refinement_log.json",),
evidence_refs=(),
)
metric_key = config.experiment.metric_key
metric_direction = config.experiment.metric_direction
# P9: Detect metric direction mismatch between config and experiment code.
# The code-gen stage instructs experiments to print a line like:
# METRIC_DEF: primary_metric | direction=higher | desc=...
# Log a warning if mismatch is detected, but trust the config value
# (BUG-06 fix: no longer auto-override, since Stage 9 and 12 now
# explicitly enforce config.metric_direction in prompts).
_runs_dir_detect = _read_prior_artifact(run_dir, "runs/")
if _runs_dir_detect and Path(_runs_dir_detect).is_dir():
import re as _re_detect
for _rf in sorted(Path(_runs_dir_detect).glob("*.json"))[:5]:
try:
_rp = _safe_json_loads(_rf.read_text(encoding="utf-8"), {})
_stdout = _rp.get("stdout", "") if isinstance(_rp, dict) else ""
_match = _re_detect.search(
r"METRIC_DEF:.*direction\s*=\s*(higher|lower)", _stdout
)
if _match:
_detected = _match.group(1)
_detected_dir = "maximize" if _detected == "higher" else "minimize"
if _detected_dir != metric_direction:
logger.warning(
"P9: Metric direction mismatch — config says '%s' but "
"experiment code declares 'direction=%s'. "
"Keeping config value '%s'. Code will be "
"corrected in next refinement cycle.",
metric_direction,
_detected,
metric_direction,
)
break
except OSError:
pass
maximize = metric_direction == "maximize"
def _is_better(candidate: float | None, current: float | None) -> bool:
    """Whether *candidate* beats *current* under the configured direction.

    A missing candidate never wins; any real candidate beats a missing
    current best. Otherwise compares using the enclosing scope's
    ``maximize`` flag (derived from ``metric_direction``).
    """
    if candidate is None:
        return False
    if current is None:
        return True
    if maximize:
        return candidate > current
    return candidate < current
def _find_metric(metrics: dict[str, object], key: str) -> float | None:
    """R13-4: Find a metric value with fuzzy key matching.

    Resolution order:
      1. exact ``key`` lookup;
      2. condition-prefixed exact match (``cond/<key>``) — returned
         immediately on first hit;
      3. aggregate keys mentioning the metric (``*mean*``/``*avg*`` or
         ``*_<key>`` / ``*/<key>_mean``), preferring the first one whose
         name contains ``mean``;
      4. any root-level (no ``/``), non-seed key containing the name.
    """
    # 1) Plain exact lookup.
    exact = _to_float(metrics.get(key))
    if exact is not None:
        return exact
    # 2)+3) Single pass in insertion order: immediate return on a
    # condition-prefixed exact match, otherwise collect aggregates.
    aggregates: list[tuple[str, float]] = []
    for name, raw in metrics.items():
        value = _to_float(raw)
        if value is None:
            continue
        if name == key or name.endswith(f"/{key}"):
            return value  # exact match behind a condition prefix
        is_named_aggregate = key in name and ("mean" in name or "avg" in name)
        if is_named_aggregate or name.endswith(f"_{key}") or name.endswith(f"/{key}_mean"):
            aggregates.append((name, value))
    if aggregates:
        # Prefer the first aggregate whose key mentions "mean".
        for name, value in aggregates:
            if "mean" in name:
                return value
        return aggregates[0][1]
    # 4) Last resort: root-level aggregate that isn't per-seed.
    for name, raw in metrics.items():
        value = _to_float(raw)
        if value is not None and key in name and "/" not in name and "seed" not in name:
            return value
    return None
requested_iterations = int(getattr(config.experiment, "max_iterations", 10) or 10)
max_iterations = max(1, min(requested_iterations, 10))
# BUG-57: Wall-clock time cap for the entire refinement stage.
# Default: 3× the per-iteration time budget (e.g., 2400s → 7200s = 2h).
import time as _time_bug57
_refine_start_time = _time_bug57.monotonic()
_per_iter_budget = int(getattr(config.experiment, "time_budget_sec", 2400) or 2400)
_max_refine_wall_sec = int(
getattr(config.experiment, "max_refine_duration_sec", 0) or 0
) or int(_per_iter_budget * 1.5)
# --- Collect baseline metrics from prior runs ---
runs_dir_path: Path | None = None
runs_dir_text = _read_prior_artifact(run_dir, "runs/")
if runs_dir_text:
runs_dir_path = Path(runs_dir_text)
run_summaries: list[str] = []
baseline_metric: float | None = None
if runs_dir_path is not None:
for run_file in sorted(runs_dir_path.glob("*.json"))[:40]:
payload = _safe_json_loads(run_file.read_text(encoding="utf-8"), {})
if not isinstance(payload, dict):
continue
# R5-5: Truncate stdout/stderr for context efficiency
summary = dict(payload)
if "stdout" in summary and isinstance(summary["stdout"], str):
lines = summary["stdout"].splitlines()
if len(lines) > 30:
summary["stdout"] = (
f"[...truncated {len(lines) - 30} lines...]\n"
+ "\n".join(lines[-30:])
)
if len(summary["stdout"]) > 2000:
summary["stdout"] = summary["stdout"][-2000:]
if "stderr" in summary and isinstance(summary["stderr"], str):
lines = summary["stderr"].splitlines()
if len(lines) > 50:
summary["stderr"] = "\n".join(lines[-50:])
if len(summary["stderr"]) > 2000:
summary["stderr"] = summary["stderr"][-2000:]
run_summaries.append(json.dumps(summary, ensure_ascii=False))
metrics = payload.get("metrics")
if not isinstance(metrics, dict):
metrics = (
payload.get("key_metrics")
if isinstance(payload.get("key_metrics"), dict)
else {}
)
metric_val = (
_find_metric(metrics, metric_key)
if isinstance(metrics, dict)
else None
)
if metric_val is None:
metric_val = _to_float(payload.get("primary_metric"))
if _is_better(metric_val, baseline_metric):
baseline_metric = metric_val
# --- Read experiment project (multi-file or single-file) ---
# BUG-58: When PIVOT rolls back to Stage 13, prefer the best refined code
# from a previous cycle (stage-13_vX/experiment_final/) over the original
# unrefined code (stage-12/experiment/ or stage-10/experiment/).
# Enhanced: try ALL versioned directories (latest first) with fallback chain.
exp_dir_text: str | None = None
_prev_refine_dirs = sorted(
run_dir.glob("stage-13_v*/experiment_final"),
key=lambda p: p.parent.name,
reverse=True, # latest version first
)
# BUG-58 fix: Find the best version across ALL cycles (not just latest)
_best_prev_metric: float | None = None
_best_prev_dir: Path | None = None
for _prd in _prev_refine_dirs:
if not _prd.is_dir():
continue
_prd_log = _prd.parent / "refinement_log.json"
if _prd_log.is_file():
_prd_data = _safe_json_loads(
_prd_log.read_text(encoding="utf-8"), {}
)
_prd_metric = _prd_data.get("best_metric") if isinstance(_prd_data, dict) else None
if isinstance(_prd_metric, (int, float)) and _is_better(_prd_metric, _best_prev_metric):
_best_prev_metric = _prd_metric
_best_prev_dir = _prd
elif _best_prev_dir is None:
# No log but directory exists — use as fallback
_best_prev_dir = _prd
if _best_prev_dir is not None:
exp_dir_text = str(_best_prev_dir)
logger.info(
"BUG-58: Recovered best refined code from PIVOT cycle: %s (metric=%s)",
_best_prev_dir.parent.name,
f"{_best_prev_metric:.4f}" if _best_prev_metric is not None else "N/A",
)
if not exp_dir_text:
exp_dir_text = _read_prior_artifact(run_dir, "experiment/")
best_files: dict[str, str] = {}
if exp_dir_text and Path(exp_dir_text).is_dir():
# BUG-EX-02: Load ALL text files (not just .py) — requirements.txt,
# setup.py, config files are needed for Docker sandbox phases.
for src_file in sorted(Path(exp_dir_text).iterdir()):
if src_file.is_file() and src_file.suffix in (
".py", ".txt", ".yaml", ".yml", ".json", ".cfg", ".ini", ".sh",
):
try:
best_files[src_file.name] = src_file.read_text(encoding="utf-8")
except UnicodeDecodeError:
pass # skip binary files
if not best_files:
# Backward compat: single experiment.py
original_code = _read_prior_artifact(run_dir, "experiment.py") or ""
if original_code:
best_files = {"main.py": original_code}
# --- Detect if prior experiment timed out ---
prior_timed_out = False
prior_time_budget = config.experiment.time_budget_sec
if runs_dir_path is not None:
for run_file in sorted(runs_dir_path.glob("*.json"))[:5]:
try:
payload = _safe_json_loads(run_file.read_text(encoding="utf-8"), {})
if isinstance(payload, dict) and payload.get("timed_out"):
prior_timed_out = True
break
except OSError:
pass
best_metric = baseline_metric
best_version = "experiment/"
# BUG-58: Recover best_metric from best previous PIVOT cycle
if _best_prev_metric is not None and _is_better(_best_prev_metric, best_metric):
best_metric = _best_prev_metric
logger.info(
"BUG-58: Recovered best_metric=%.4f from previous PIVOT",
best_metric,
)
no_improve_streak = 0
consecutive_no_metrics = 0
log: dict[str, Any] = {
"generated": _utcnow_iso(),
"mode": config.experiment.mode,
"metric_key": metric_key,
"metric_direction": metric_direction,
"max_iterations_requested": requested_iterations,
"max_iterations_executed": max_iterations,
"baseline_metric": baseline_metric,
"project_files": list(best_files.keys()),
"iterations": [],
"converged": False,
"stop_reason": "max_iterations_reached",
}
# --- Helper: write files to a directory ---
def _write_project(target_dir: Path, project_files: dict[str, str]) -> None:
target_dir.mkdir(parents=True, exist_ok=True)
for fname, code in project_files.items():
(target_dir / fname).write_text(code, encoding="utf-8")
# --- Helper: format all files for LLM context ---
def _files_to_context(project_files: dict[str, str]) -> str:
parts = []
for fname, code in sorted(project_files.items()):
parts.append(f"```filename:{fname}\n{code}\n```")
return "\n\n".join(parts)
if llm is None:
logger.info("Stage 13: LLM unavailable, saving original experiment as final")
final_dir = stage_dir / "experiment_final"
_write_project(final_dir, best_files)
# Backward compat
if "main.py" in best_files:
(stage_dir / "experiment_final.py").write_text(
best_files["main.py"], encoding="utf-8"
)
log.update(
{
"converged": True,
"stop_reason": "llm_unavailable",
"best_metric": best_metric,
"best_version": "experiment_final/",
"iterations": [
{
"iteration": 0,
"version_dir": "experiment_final/",
"source": "fallback_original",
"metric": best_metric,
}
],
}
)
(stage_dir / "refinement_log.json").write_text(
json.dumps(log, indent=2), encoding="utf-8"
)
artifacts = ("refinement_log.json", "experiment_final/")
return StageResult(
stage=Stage.ITERATIVE_REFINE,
status=StageStatus.DONE,
artifacts=artifacts,
evidence_refs=tuple(f"stage-13/{a}" for a in artifacts),
)
_pm = prompts or PromptManager()
timeout_refine_attempts = 0
# R7-3: Read experiment plan to detect condition coverage gaps
_exp_plan_text = _read_prior_artifact(run_dir, "exp_plan.yaml") or ""
_condition_coverage_hint = ""
if _exp_plan_text and run_summaries:
# Check if stdout contains condition labels
_all_stdout = " ".join(run_summaries)
_has_condition_labels = "condition=" in _all_stdout
if not _has_condition_labels and _exp_plan_text.strip():
_condition_coverage_hint = (
"\nCONDITION COVERAGE GAP DETECTED:\n"
"The experiment plan specifies multiple conditions/treatments, "
"but the output contains NO condition labels (no 'condition=...' in stdout).\n"
"You MUST:\n"
"1. Run ALL conditions/treatments from the experiment plan independently\n"
"2. Label each metric output: `condition= {metric_key}: `\n"
"3. Print a SUMMARY line comparing all conditions after completion\n"
"This is the MOST IMPORTANT improvement — a single unlabeled metric stream "
"cannot support any comparative conclusions.\n\n"
)
logger.info(
"Stage 13: condition coverage gap detected, injecting multi-condition hint"
)
# P1: Track metrics history for saturation detection
_metrics_history: list[float | None] = [baseline_metric]
for iteration in range(1, max_iterations + 1):
# BUG-57: Check wall-clock time before starting a new iteration
_elapsed = _time_bug57.monotonic() - _refine_start_time
if _elapsed > _max_refine_wall_sec:
logger.warning(
"Stage 13: Wall-clock time cap reached (%.0fs > %ds). "
"Stopping refinement after %d iterations.",
_elapsed, _max_refine_wall_sec, iteration - 1,
)
log["stop_reason"] = "wall_clock_time_cap"
break
logger.info("Stage 13: refinement iteration %d/%d (%.0fs elapsed, cap %ds)",
iteration, max_iterations, _elapsed, _max_refine_wall_sec)
# P1: Detect metric saturation and inject difficulty upgrade hint
_saturation_hint = ""
_valid_metrics = [m for m in _metrics_history if m is not None]
if len(_valid_metrics) >= 2:
_last_two = _valid_metrics[-2:]
_saturated = False
# Use relative change rate instead of hard-coded thresholds
_change_rate = abs(_last_two[-1] - _last_two[-2]) / max(abs(_last_two[-2]), 1e-8)
if metric_direction == "minimize":
_saturated = all(m <= 0.001 for m in _last_two) or (
_change_rate < 0.001 and _last_two[-1] < 0.01
)
else:
_saturated = all(m >= 0.999 for m in _last_two) or (
_change_rate < 0.001 and _last_two[-1] > 0.99
)
if _saturated:
_saturation_hint = (
"\n\nWARNING — BENCHMARK SATURATION DETECTED:\n"
"All methods achieve near-perfect scores, making the task too easy "
"to discriminate between methods.\n"
"YOU MUST increase benchmark difficulty in this iteration:\n"
"1. Increase the number of actions/decisions from 8 to at least 20\n"
"2. Increase the horizon from 12-18 to at least 50-100 steps\n"
"3. Increase noise level to at least 0.3-0.5\n"
"4. Add partial observability (agent cannot see full state)\n"
"5. Add delayed rewards (reward only at episode end)\n"
"6. Ensure random search achieves < 50% success rate\n"
"Without this change, the experiment produces meaningless results.\n"
)
logger.warning("Stage 13: metric saturation detected, injecting difficulty upgrade hint")
files_context = _files_to_context(best_files)
# BUG-10 fix: anchor refinement to original experiment plan
_exp_plan_anchor = ""
if _exp_plan_text.strip():
_exp_plan_anchor = (
"Original experiment plan (exp_plan.yaml):\n"
"```yaml\n" + _exp_plan_text[:4000] + "\n```\n"
"You MUST preserve ALL condition names from this plan.\n\n"
)
ip = _pm.sub_prompt(
"iterative_improve",
metric_key=metric_key,
metric_direction=metric_direction,
files_context=files_context,
run_summaries=chr(10).join(run_summaries[:20]),
condition_coverage_hint=_condition_coverage_hint,
topic=config.research.topic,
exp_plan_anchor=_exp_plan_anchor,
)
# --- Timeout-aware prompt injection ---
user_prompt = ip.user + _saturation_hint
if prior_timed_out and baseline_metric is None:
timeout_refine_attempts += 1
timeout_hint = (
f"\n\nCRITICAL: The experiment TIMED OUT after {prior_time_budget}s "
f"with NO results. You MUST drastically reduce the experiment scale:\n"
f"- Reduce total runs to ≤50\n"
f"- Reduce steps per run to ≤2000\n"
f"- Remove conditions that are not essential\n"
f"- Add time.time() checks to stop gracefully before timeout\n"
f"- Print intermediate metrics frequently so partial data is captured\n"
f"- Time budget is {prior_time_budget}s — design for ≤{int(prior_time_budget * 0.7)}s\n"
)
user_prompt = user_prompt + timeout_hint
logger.warning(
"Stage 13: injecting timeout-aware prompt (attempt %d)",
timeout_refine_attempts,
)
response = _chat_with_prompt(
llm,
ip.system,
user_prompt,
max_tokens=ip.max_tokens or 8192,
)
extracted_files = _extract_multi_file_blocks(response.content)
# If LLM returns only single block, treat as main.py update
if not extracted_files:
single_code = _extract_code_block(response.content)
if single_code.strip():
extracted_files = {"main.py": single_code}
# R8-2: Merge with best_files to preserve supporting modules
# (e.g., graphs.py, game.py) that the LLM didn't rewrite
candidate_files = dict(best_files)
if extracted_files:
candidate_files.update(extracted_files)
# If LLM returned nothing at all, candidate_files == best_files (unchanged)
# BUG-R6-02: Preserve entry point when LLM strips main() function.
# The LLM often returns only class/function improvements without the
# main() entry point, causing the script to exit with no output.
_new_main = candidate_files.get("main.py", "")
_old_main = best_files.get("main.py", "")
if (
_new_main
and _old_main
and "if __name__" not in _new_main
and "if __name__" in _old_main
):
# Extract the entry-point block from original main.py
_ep_idx = _old_main.rfind("\ndef main(")
if _ep_idx == -1:
_ep_idx = _old_main.rfind("\nif __name__")
if _ep_idx != -1:
_entry_block = _old_main[_ep_idx:]
candidate_files["main.py"] = _new_main.rstrip() + "\n\n" + _entry_block
logger.info(
"Stage 13 iter %d: restored entry point stripped by LLM "
"(%d chars appended from original main.py)",
iteration,
len(_entry_block),
)
# Validate main.py
main_code = candidate_files.get("main.py", "")
validation = validate_code(main_code)
issue_text = ""
repaired = False
if not validation.ok:
issue_text = format_issues_for_llm(validation)
logger.info(
"Stage 13 iteration %d validation failed: %s",
iteration,
validation.summary(),
)
irp = _pm.sub_prompt(
"iterative_repair",
issue_text=issue_text,
all_files_ctx=_files_to_context(candidate_files),
)
repair_response = _chat_with_prompt(llm, irp.system, irp.user)
candidate_files["main.py"] = _extract_code_block(repair_response.content)
validation = validate_code(candidate_files["main.py"])
repaired = True
# Save version directory
version_dir = stage_dir / f"experiment_v{iteration}"
_write_project(version_dir, candidate_files)
iter_record: dict[str, Any] = {
"iteration": iteration,
"version_dir": f"experiment_v{iteration}/",
"files": list(candidate_files.keys()),
"validation_ok": validation.ok,
"validation_summary": validation.summary(),
"repaired": repaired,
"metric": None,
"improved": False,
}
if issue_text:
iter_record["validation_issues"] = issue_text
metric_val = None # R6-3: initialize before conditional block
if validation.ok and config.experiment.mode in ("sandbox", "docker"):
# P7: Ensure deps for refined code (subprocess sandbox only)
if config.experiment.mode == "sandbox":
_refine_code = "\n".join(candidate_files.values())
_ensure_sandbox_deps(_refine_code, config.experiment.sandbox.python_path)
sandbox = create_sandbox(
config.experiment,
stage_dir / f"refine_sandbox_v{iteration}",
)
rerun = sandbox.run_project(
version_dir,
timeout_sec=config.experiment.time_budget_sec,
)
metric_val = _find_metric(rerun.metrics, metric_key)
# R19-1: Store stdout (capped) so PAIRED lines survive for Stage 14
_stdout_cap = rerun.stdout[:50000] if rerun.stdout else ""
iter_record["sandbox"] = {
"returncode": rerun.returncode,
"metrics": rerun.metrics,
"elapsed_sec": rerun.elapsed_sec,
"timed_out": rerun.timed_out,
"stderr": rerun.stderr[:2000] if rerun.stderr else "",
"stdout": _stdout_cap,
}
iter_record["metric"] = metric_val
# BUG-110: Parse ABLATION_CHECK lines from stdout
if rerun.stdout:
import re as _re_ablation
_ablation_checks = _re_ablation.findall(
r"ABLATION_CHECK:\s*(\S+)\s+vs\s+(\S+)\s+outputs_differ=(True|False)",
rerun.stdout,
)
if _ablation_checks:
_identical_pairs = [
(c1, c2) for c1, c2, diff in _ablation_checks if diff == "False"
]
iter_record["ablation_checks"] = [
{"cond1": c1, "cond2": c2, "differ": diff == "True"}
for c1, c2, diff in _ablation_checks
]
if _identical_pairs:
_pairs_str = ", ".join(f"{c1} vs {c2}" for c1, c2 in _identical_pairs)
logger.warning(
"BUG-110: Identical ablation outputs detected: %s. "
"Ablation conditions may not be wired correctly.",
_pairs_str,
)
iter_record["ablation_identical"] = True
# --- Track timeout in refine sandbox ---
if rerun.timed_out:
prior_timed_out = True
timeout_refine_attempts += 1
logger.warning(
"Stage 13 iteration %d: sandbox timed out after %.1fs",
iteration,
rerun.elapsed_sec,
)
# If still no metrics after timeout, use partial stdout metrics
if not rerun.metrics and rerun.stdout:
from researchclaw.experiment.sandbox import parse_metrics as _parse_sb_metrics
partial = _parse_sb_metrics(rerun.stdout)
if partial:
iter_record["sandbox"]["metrics"] = partial
metric_val = _find_metric(partial, metric_key)
iter_record["metric"] = metric_val
logger.info(
"Stage 13 iteration %d: recovered %d partial metrics from timeout stdout",
iteration,
len(partial),
)
# --- Detect runtime issues (NaN/Inf, stderr warnings) ---
runtime_issues = _detect_runtime_issues(rerun)
if runtime_issues:
iter_record["runtime_issues"] = runtime_issues
logger.info(
"Stage 13 iteration %d: runtime issues detected: %s",
iteration,
runtime_issues[:200],
)
# Attempt LLM repair with runtime context
rrp = _pm.sub_prompt(
"iterative_repair",
issue_text=runtime_issues,
all_files_ctx=_files_to_context(candidate_files),
)
repair_resp = _chat_with_prompt(llm, rrp.system, rrp.user)
repaired_files = _extract_multi_file_blocks(repair_resp.content)
if not repaired_files:
single = _extract_code_block(repair_resp.content)
if single.strip():
repaired_files = dict(candidate_files)
repaired_files["main.py"] = single
if repaired_files:
# BUG-106 fix: merge instead of replace to preserve
# supporting modules (trainers.py, utils.py, etc.)
merged = dict(candidate_files)
merged.update(repaired_files)
candidate_files = merged
_write_project(version_dir, candidate_files)
# Re-run after runtime fix
sandbox2 = create_sandbox(
config.experiment,
stage_dir / f"refine_sandbox_v{iteration}_fix",
)
rerun2 = sandbox2.run_project(
version_dir,
timeout_sec=config.experiment.time_budget_sec,
)
metric_val = _find_metric(rerun2.metrics, metric_key)
iter_record["sandbox_after_fix"] = {
"returncode": rerun2.returncode,
"metrics": rerun2.metrics,
"elapsed_sec": rerun2.elapsed_sec,
"timed_out": rerun2.timed_out,
}
iter_record["metric"] = metric_val
iter_record["runtime_repaired"] = True
if metric_val is not None:
consecutive_no_metrics = 0
# R6-1: Only count toward no_improve_streak when we have real metrics
if _is_better(metric_val, best_metric):
best_metric = metric_val
best_files = dict(candidate_files)
best_version = f"experiment_v{iteration}/"
iter_record["improved"] = True
no_improve_streak = 0
else:
no_improve_streak += 1
else:
consecutive_no_metrics += 1
elif validation.ok and best_version == "experiment/":
best_files = dict(candidate_files)
best_version = f"experiment_v{iteration}/"
# P1: Track metric for saturation detection
_metrics_history.append(metric_val)
log["iterations"].append(iter_record)
if consecutive_no_metrics >= 3:
log["stop_reason"] = "consecutive_no_metrics"
logger.warning("Stage 13: Aborting after %d consecutive iterations without metrics", consecutive_no_metrics)
break
if no_improve_streak >= 2:
log["converged"] = True
log["stop_reason"] = "no_improvement_for_2_iterations"
logger.info(
"Stage 13 converged after %d iterations (no improvement streak=%d)",
iteration,
no_improve_streak,
)
break
# Write final experiment directory
final_dir = stage_dir / "experiment_final"
_write_project(final_dir, best_files)
# Backward compat: also write experiment_final.py (copy of main.py)
if "main.py" in best_files:
(stage_dir / "experiment_final.py").write_text(
best_files["main.py"], encoding="utf-8"
)
log["best_metric"] = best_metric
log["best_version"] = best_version
log["final_version"] = "experiment_final/"
# BUG-110: Aggregate ablation check results across iterations
_all_ablation_identical = any(
iter_rec.get("ablation_identical", False)
for iter_rec in log.get("iterations", [])
if isinstance(iter_rec, dict)
)
if _all_ablation_identical:
log["ablation_identical_warning"] = True
(stage_dir / "refinement_log.json").write_text(
json.dumps(log, indent=2), encoding="utf-8"
)
artifacts = ["refinement_log.json", "experiment_final/"]
artifacts.extend(
entry["version_dir"]
for entry in log["iterations"]
if isinstance(entry, dict) and isinstance(entry.get("version_dir"), str)
)
return StageResult(
stage=Stage.ITERATIVE_REFINE,
status=StageStatus.DONE,
artifacts=tuple(artifacts),
evidence_refs=tuple(f"stage-13/{a}" for a in artifacts),
)
================================================
FILE: researchclaw/pipeline/stage_impls/_experiment_design.py
================================================
"""Stage 9: Experiment design."""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
from typing import Any
import yaml
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
StageResult,
_build_context_preamble,
_chat_with_prompt,
_extract_yaml_block,
_get_evolution_overlay,
_load_hardware_profile,
_read_prior_artifact,
_safe_json_loads,
_utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
def _execute_experiment_design(
stage_dir: Path,
run_dir: Path,
config: RCConfig,
adapters: AdapterBundle,
*,
llm: LLMClient | None = None,
prompts: PromptManager | None = None,
) -> StageResult:
hypotheses = _read_prior_artifact(run_dir, "hypotheses.md") or ""
preamble = _build_context_preamble(
config, run_dir, include_goal=True, include_hypotheses=True
)
plan: dict[str, Any] | None = None
# ── Domain detection ──────────────────────────────────────────────────
# Detect the research domain early so we can adapt experiment design
# and code generation. For ML domains, existing behavior is unchanged.
_domain_profile = None
try:
from researchclaw.domains.detector import detect_domain as _detect_domain_adv
_domain_profile = _detect_domain_adv(
topic=config.research.topic,
hypotheses=hypotheses,
)
logger.info(
"Domain detected: %s (%s)",
_domain_profile.display_name,
_domain_profile.domain_id,
)
# Persist domain profile for Stage 10
import json as _json_dd
(stage_dir / "domain_profile.json").write_text(
_json_dd.dumps({
"domain_id": _domain_profile.domain_id,
"display_name": _domain_profile.display_name,
"experiment_paradigm": _domain_profile.experiment_paradigm,
"core_libraries": _domain_profile.core_libraries,
"gpu_required": _domain_profile.gpu_required,
}, indent=2),
encoding="utf-8",
)
except Exception: # noqa: BLE001
logger.debug("Domain detection unavailable", exc_info=True)
if llm is not None:
_pm = prompts or PromptManager()
# Pass dataset_guidance block for experiment design
try:
_dg_block = _pm.block("dataset_guidance")
except (KeyError, Exception): # noqa: BLE001
_dg_block = ""
# I-08: Inject RL step guidance for RL topics
_rl_kws = ("reinforcement learning", "ppo", "sac", "td3", "ddpg",
"dqn", "mujoco", "continuous control", "actor-critic",
"policy gradient", "exploration bonus")
_is_rl_topic = any(kw in config.research.topic.lower() for kw in _rl_kws)
if _is_rl_topic:
try:
_dg_block += _pm.block("rl_step_guidance")
except Exception: # noqa: BLE001
pass
# Improvement G: For RL with short budget, constrain to classic control
if config.experiment.time_budget_sec <= 3600:
_dg_block += (
"\n\n## RL TIME CONSTRAINT (MANDATORY):\n"
f"Your time budget is {config.experiment.time_budget_sec}s (≤ 3600s).\n"
"You MUST use ONLY classic control environments: "
"CartPole-v1, Pendulum-v1, MountainCar-v0, Acrobot-v1, LunarLander-v3.\n"
"Do NOT use MuJoCo (HalfCheetah, Hopper, Walker2d, Ant, Humanoid) — "
"they require >5000s for meaningful training.\n"
)
if config.experiment.time_budget_sec <= 1800:
_dg_block += (
"Time budget ≤ 1800s: use ONLY CartPole-v1 or Pendulum-v1 "
"(the simplest environments).\n"
)
# F-01: Inject framework docs for experiment design
try:
from researchclaw.data import detect_frameworks, load_framework_docs
_fw_ids = detect_frameworks(config.research.topic, hypotheses)
if _fw_ids:
_fw_docs = load_framework_docs(_fw_ids, max_chars=4000)
if _fw_docs:
_dg_block += _fw_docs
except Exception: # noqa: BLE001
pass
# Improvement A: Compute hardware profile + per-condition budget
_hw_profile_str = (
"- GPU: NVIDIA RTX 6000 Ada (49140 MB VRAM)\n"
"- GPU count: 1\n"
"- CPU: shared server"
)
_per_condition_sec = int(config.experiment.time_budget_sec * 0.7 / 6)
_tier1 = "CIFAR-10, CIFAR-100, MNIST, FashionMNIST, STL-10, SVHN"
_overlay = _get_evolution_overlay(run_dir, "experiment_design")
sp = _pm.for_stage(
"experiment_design",
evolution_overlay=_overlay,
preamble=preamble,
hypotheses=hypotheses,
dataset_guidance=_dg_block,
time_budget_sec=config.experiment.time_budget_sec,
metric_key=config.experiment.metric_key,
metric_direction=config.experiment.metric_direction,
hardware_profile=_hw_profile_str,
per_condition_budget_sec=_per_condition_sec,
available_tier1_datasets=_tier1,
)
resp = _chat_with_prompt(
llm,
sp.system,
sp.user,
json_mode=sp.json_mode,
max_tokens=sp.max_tokens,
)
raw_yaml = _extract_yaml_block(resp.content)
try:
parsed = yaml.safe_load(raw_yaml)
except yaml.YAMLError:
parsed = None
# Fallback: reasoning models sometimes emit the YAML without fences
# or wrapped in prose. Try parsing the whole response as YAML.
if not isinstance(parsed, dict):
try:
parsed = yaml.safe_load(resp.content)
except yaml.YAMLError:
pass
# Last fallback: try to find any YAML-like dict in the response
if not isinstance(parsed, dict):
import re as _re_yaml
# Look for lines starting with known keys
_yaml_lines = []
_capturing = False
for line in resp.content.splitlines():
if _re_yaml.match(
r"^(baselines|proposed_methods|ablations|datasets|"
r"metrics|objectives|risks|compute_budget)\s*:",
line,
):
_capturing = True
if _capturing:
if line.strip() == "" or line.startswith("```"):
continue
if line.startswith("#") or line.startswith("**"):
continue
_yaml_lines.append(line)
if _yaml_lines:
try:
parsed = yaml.safe_load("\n".join(_yaml_lines))
except yaml.YAMLError:
pass
if isinstance(parsed, dict):
plan = parsed
else:
logger.warning(
"Stage 09: LLM response could not be parsed as YAML "
"(len=%d, first 200 chars: %s). Content extraction method "
"returned: %s",
len(resp.content),
resp.content[:200],
raw_yaml[:200] if raw_yaml else "",
)
# BUG-12: Retry with a stricter, shorter prompt
if llm is not None:
logger.info("Stage 09: Retrying with strict YAML-only prompt...")
_retry_prompt = (
"Output ONLY valid YAML. No prose, no markdown fences, no explanation.\n"
f"Topic: {config.research.topic}\n"
"Required keys: baselines, proposed_methods, ablations, "
"datasets, metrics, objectives, risks, compute_budget.\n"
"Each key maps to a list of strings."
)
_retry_resp = _chat_with_prompt(
llm,
"You output ONLY valid YAML. Nothing else.",
_retry_prompt,
max_tokens=4096,
)
try:
_retry_parsed = yaml.safe_load(_retry_resp.content)
if isinstance(_retry_parsed, dict):
plan = _retry_parsed
logger.info("Stage 09: Strict YAML retry succeeded.")
except yaml.YAMLError:
pass
# BUG-12: Fallback 4 — extract method/baseline names from Stage 8 hypotheses
if plan is None:
_hyp_text = _read_prior_artifact(run_dir, "hypotheses.md") or ""
if _hyp_text:
import re as _re_hyp
# Extract method-like names from hypothesis text
_method_candidates = _re_hyp.findall(
r"(?:proposed|our|novel|new)\s+(?:method|approach|algorithm|framework|model)[:\s]+[\"']?([A-Za-z][\w-]+)",
_hyp_text, _re_hyp.IGNORECASE,
)
_baseline_candidates = _re_hyp.findall(
r"(?:baseline|compare|existing|standard|traditional)\s+(?:method|approach|model)?[:\s]+[\"']?([A-Za-z][\w-]+)",
_hyp_text, _re_hyp.IGNORECASE,
)
if _method_candidates or _baseline_candidates:
logger.info(
"Stage 09: Extracted names from hypotheses: methods=%s, baselines=%s",
_method_candidates[:3], _baseline_candidates[:3],
)
plan = {
"topic": config.research.topic,
"generated": _utcnow_iso(),
"objectives": ["Evaluate hypotheses with controlled experiments"],
"datasets": ["primary_dataset"],
"baselines": _baseline_candidates[:3] or ["baseline_1", "baseline_2"],
"proposed_methods": _method_candidates[:3] or ["proposed_method"],
"ablations": ["without_key_component", "simplified_version"],
"metrics": [config.experiment.metric_key, "secondary_metric"],
"risks": ["validity threats", "confounding variables"],
"compute_budget": {"max_gpu": 1, "max_hours": 4},
}
if plan is None:
# BUG-12: Use domain-aware names instead of fully generic placeholders
_topic_prefix = config.research.topic.split()[0] if config.research.topic else "method"
logger.warning(
"Stage 09: LLM failed to produce valid experiment plan YAML. "
"Using topic-derived fallback."
)
plan = {
"topic": config.research.topic,
"generated": _utcnow_iso(),
"objectives": ["Evaluate hypotheses with controlled experiments"],
"datasets": ["primary_dataset", "secondary_dataset"],
"baselines": [f"{_topic_prefix}_baseline_1", f"{_topic_prefix}_baseline_2"],
"proposed_methods": [f"{_topic_prefix}_proposed", f"{_topic_prefix}_variant"],
"ablations": ["without_key_component", "simplified_version"],
"metrics": [config.experiment.metric_key, "secondary_metric"],
"risks": ["validity threats", "confounding variables"],
"compute_budget": {"max_gpu": 1, "max_hours": 4},
}
# ── BA: BenchmarkAgent — intelligent dataset/baseline selection ──────
_benchmark_plan = None
# BUG-40: Skip BenchmarkAgent for non-ML domains — it has no relevant
# benchmarks for physics/chemistry/mathematics/etc. and would inject
# wrong datasets (e.g., CIFAR-10 for PDE topics).
_ba_domain_id, _, _ = _detect_domain(
config.research.topic,
tuple(config.research.domains) if config.research.domains else (),
)
_ba_domain_ok = _ba_domain_id == "ml"
if not _ba_domain_ok:
logger.info(
"BenchmarkAgent skipped: domain '%s' is not ML (topic: %s)",
_ba_domain_id, config.research.topic[:80],
)
if (
_ba_domain_ok
and config.experiment.benchmark_agent.enabled
and config.experiment.mode in ("sandbox", "docker")
and llm is not None
):
try:
from researchclaw.agents.benchmark_agent import BenchmarkOrchestrator
from researchclaw.agents.benchmark_agent.orchestrator import (
BenchmarkAgentConfig as _BACfg,
)
_ba_cfg_raw = config.experiment.benchmark_agent
_ba_cfg = _BACfg(
enabled=_ba_cfg_raw.enabled,
enable_hf_search=_ba_cfg_raw.enable_hf_search,
max_hf_results=_ba_cfg_raw.max_hf_results,
enable_web_search=_ba_cfg_raw.enable_web_search,
max_web_results=_ba_cfg_raw.max_web_results,
web_search_min_local=_ba_cfg_raw.web_search_min_local,
tier_limit=_ba_cfg_raw.tier_limit,
min_benchmarks=_ba_cfg_raw.min_benchmarks,
min_baselines=_ba_cfg_raw.min_baselines,
prefer_cached=_ba_cfg_raw.prefer_cached,
max_iterations=_ba_cfg_raw.max_iterations,
)
_hw = _load_hardware_profile(run_dir)
_ba = BenchmarkOrchestrator(
llm,
config=_ba_cfg,
gpu_memory_mb=(
_hw.get("gpu_memory_mb", 49000) if _hw else 49000
),
time_budget_sec=config.experiment.time_budget_sec,
network_policy=(
config.experiment.docker.network_policy
if config.experiment.mode == "docker"
else "full"
),
stage_dir=stage_dir / "benchmark_agent",
)
_benchmark_plan = _ba.orchestrate({
"topic": config.research.topic,
"hypothesis": hypotheses,
"experiment_plan": plan.get("objectives", "") if isinstance(plan, dict) else "",
})
# Inject BenchmarkAgent selections into experiment plan
if isinstance(plan, dict) and _benchmark_plan.selected_benchmarks:
plan["datasets"] = [
b.get("name", "Unknown") for b in _benchmark_plan.selected_benchmarks
]
# Normalize existing baselines to list of strings
# BUG-35: LLM may emit baselines as dict, list of dicts,
# or list of strings — normalize all to list[str].
_baselines_from_plan = plan.get("baselines", [])
if isinstance(_baselines_from_plan, dict):
_baselines_from_plan = list(_baselines_from_plan.keys())
elif isinstance(_baselines_from_plan, list):
_baselines_from_plan = [
item["name"] if isinstance(item, dict) else str(item)
for item in _baselines_from_plan
]
else:
_baselines_from_plan = []
plan["baselines"] = [
bl.get("name", "Unknown") for bl in _benchmark_plan.selected_baselines
] + _baselines_from_plan
# Deduplicate baselines
plan["baselines"] = list(dict.fromkeys(plan["baselines"]))
logger.info(
"BenchmarkAgent: %d benchmarks, %d baselines selected (%d LLM calls, %.1fs)",
len(_benchmark_plan.selected_benchmarks),
len(_benchmark_plan.selected_baselines),
_benchmark_plan.total_llm_calls,
_benchmark_plan.elapsed_sec,
)
except Exception as _ba_exc:
logger.warning("BenchmarkAgent failed (non-fatal): %s", _ba_exc)
# Save benchmark plan for code_generation stage
if _benchmark_plan is not None:
try:
(stage_dir / "benchmark_plan.json").write_text(
json.dumps(_benchmark_plan.to_dict(), indent=2, ensure_ascii=False),
encoding="utf-8",
)
except Exception: # noqa: BLE001
pass
plan.setdefault("topic", config.research.topic)
# BUG-R41-09: Enforce condition count limit based on time budget.
# Too many conditions (30+) guarantee timeouts and wasted compute.
_time_budget = getattr(
getattr(config, "experiment", None), "time_budget_sec", 3600
)
_max_conditions = 8 # default for budgets ≤ 3600s
if _time_budget > 3600:
_max_conditions = 12
if _time_budget > 7200:
_max_conditions = 20
_baselines = plan.get("baselines", [])
if isinstance(_baselines, dict):
_baselines = list(_baselines.values())
_proposed = plan.get("proposed_methods", [])
if isinstance(_proposed, dict):
_proposed = list(_proposed.values())
_ablations = plan.get("ablations", [])
if isinstance(_ablations, dict):
_ablations = list(_ablations.values())
_total = len(_baselines) + len(_proposed) + len(_ablations)
if _total > _max_conditions:
logger.warning(
"Stage 9: Plan has %d conditions (limit %d for %ds budget). "
"Trimming to fit.",
_total, _max_conditions, _time_budget,
)
# Keep all proposed methods (up to max), trim baselines and ablations
_proposed_count = min(len(_proposed), max(1, _max_conditions - 4))
_remaining = max(0, _max_conditions - _proposed_count)
_baseline_budget = max(1, _remaining // 2)
_ablation_budget = max(0, _remaining - _baseline_budget)
if len(_proposed) > _proposed_count:
plan["proposed_methods"] = _proposed[:_proposed_count]
logger.info(
"Stage 9: Trimmed proposed methods %d → %d",
len(_proposed), _proposed_count,
)
if len(_baselines) > _baseline_budget:
plan["baselines"] = _baselines[:_baseline_budget]
logger.info(
"Stage 9: Trimmed baselines %d → %d",
len(_baselines), _baseline_budget,
)
if len(_ablations) > _ablation_budget:
plan["ablations"] = _ablations[:_ablation_budget]
logger.info(
"Stage 9: Trimmed ablations %d → %d",
len(_ablations), _ablation_budget,
)
(stage_dir / "exp_plan.yaml").write_text(
yaml.dump(plan, default_flow_style=False, allow_unicode=True),
encoding="utf-8",
)
return StageResult(
stage=Stage.EXPERIMENT_DESIGN,
status=StageStatus.DONE,
artifacts=("exp_plan.yaml",),
evidence_refs=("stage-09/exp_plan.yaml",),
)
================================================
FILE: researchclaw/pipeline/stage_impls/_literature.py
================================================
"""Stages 3-6: Search strategy, literature collection, screening, and knowledge extraction."""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
from typing import Any
import yaml
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._helpers import (
StageResult,
_build_fallback_queries,
_chat_with_prompt,
_extract_topic_keywords,
_extract_yaml_block,
_get_evolution_overlay,
_parse_jsonl_rows,
_read_prior_artifact,
_safe_filename,
_safe_json_loads,
_utcnow_iso,
_write_jsonl,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Local helpers
# ---------------------------------------------------------------------------
def _expand_search_queries(queries: list[str], topic: str) -> list[str]:
"""Expand search queries for broader literature coverage.
Generates additional queries by extracting key phrases from the topic
and creating focused sub-queries. This ensures we find papers even when
the original queries are too narrow or specific for arXiv.
"""
expanded = list(queries) # keep originals
seen = {q.lower().strip() for q in queries}
# Extract key phrases from topic by splitting on common delimiters
# e.g. "Comparing A, B, and C on X with Y" → ["A", "B", "C", "X", "Y"]
topic_words = topic.split()
# Generate shorter, broader queries from the topic
if len(topic_words) > 5:
# First 5 words as a broader query
broad = " ".join(topic_words[:5])
if broad.lower().strip() not in seen:
expanded.append(broad)
seen.add(broad.lower().strip())
# Last 5 words as another perspective
tail = " ".join(topic_words[-5:])
if tail.lower().strip() not in seen:
expanded.append(tail)
seen.add(tail.lower().strip())
# Add "survey" and "benchmark" variants of the topic
for suffix in ("survey", "benchmark", "comparison"):
# Take first 4 content words + suffix
short_topic = " ".join(topic_words[:4])
variant = f"{short_topic} {suffix}"
if variant.lower().strip() not in seen:
expanded.append(variant)
seen.add(variant.lower().strip())
return expanded
# ---------------------------------------------------------------------------
# Stage executors
# ---------------------------------------------------------------------------
def _execute_search_strategy(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 3: Build the literature-search plan and query list.

    Asks the LLM for a search plan (YAML) plus a source list; falls back to
    a template plan built from topic keywords when the LLM is unavailable or
    its output cannot be parsed.  Optionally verifies each source URL via
    the web-fetch adapter.  Extracts, shortens, and de-duplicates the
    queries, then writes three artifacts into *stage_dir*:
    ``search_plan.yaml``, ``sources.json``, and ``queries.json`` (consumed
    by Stage 4).

    Returns:
        StageResult for ``Stage.SEARCH_STRATEGY`` with the three artifacts.
    """
    # Context from Stage 2 (empty string if the artifact is missing).
    problem_tree = _read_prior_artifact(run_dir, "problem_tree.md") or ""
    topic = config.research.topic
    plan: dict[str, Any] | None = None
    sources: list[dict[str, Any]] | None = None

    # --- 1) LLM-generated plan (preferred path) ---
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "search_strategy")
        sp = _pm.for_stage("search_strategy", evolution_overlay=_overlay, topic=topic, problem_tree=problem_tree)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        # The response is expected to be JSON carrying a YAML plan string
        # under "search_plan_yaml" and a list of source dicts under "sources".
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict):
            yaml_text = str(payload.get("search_plan_yaml", "")).strip()
            if yaml_text:
                try:
                    # _extract_yaml_block strips markdown fences around the YAML.
                    parsed = yaml.safe_load(_extract_yaml_block(yaml_text))
                except yaml.YAMLError:
                    parsed = None
                if isinstance(parsed, dict):
                    plan = parsed
            src = payload.get("sources", [])
            if isinstance(src, list):
                # Keep only well-formed (dict) source entries.
                sources = [item for item in src if isinstance(item, dict)]

    # --- 2) Template fallback plan when the LLM path produced nothing ---
    if plan is None:
        # Build smart fallback queries by extracting key terms from topic
        # instead of using the raw (often very long) topic string.
        _fallback_queries = _build_fallback_queries(topic)
        plan = {
            "topic": topic,
            "generated": _utcnow_iso(),
            "search_strategies": [
                {
                    "name": "keyword_core",
                    "queries": _fallback_queries[:5],
                    "sources": ["arxiv", "semantic_scholar", "openreview"],
                    "max_results_per_query": 60,
                },
                {
                    "name": "backward_forward_citation",
                    # Use queries 6-10 if available, else reuse the first 3.
                    "queries": _fallback_queries[5:10] or _fallback_queries[:3],
                    "sources": ["semantic_scholar", "google_scholar"],
                    "depth": 1,
                },
            ],
            "filters": {
                "min_year": 2020,
                "language": ["en"],
                "peer_review_preferred": True,
            },
            "deduplication": {"method": "title_doi_hash", "fuzzy_threshold": 0.9},
        }

    # --- 3) Default sources when the LLM supplied none ---
    if not sources:
        sources = [
            {
                "id": "arxiv",
                "name": "arXiv",
                "type": "api",
                "url": "https://export.arxiv.org/api/query",
                "status": "available",
                "query": topic,
                "verified_at": _utcnow_iso(),
            },
            {
                "id": "semantic_scholar",
                "name": "Semantic Scholar",
                "type": "api",
                "url": "https://api.semanticscholar.org/graph/v1/paper/search",
                "status": "available",
                "query": topic,
                "verified_at": _utcnow_iso(),
            },
        ]

    # --- 4) Optionally verify each source URL is reachable ---
    if config.openclaw_bridge.use_web_fetch:
        for src in sources:
            try:
                response = adapters.web_fetch.fetch(str(src.get("url", "")))
                # 405 counts as "verified": some APIs reject bare GETs but
                # are clearly alive.
                src["status"] = (
                    "verified"
                    if response.status_code in (200, 301, 302, 405)
                    else "unreachable"
                )
                src["http_status"] = response.status_code
            except Exception:  # noqa: BLE001
                # Network failure is non-fatal; mark status unknown.
                src["status"] = "unknown"

    # --- 5) Persist plan and sources artifacts ---
    (stage_dir / "search_plan.yaml").write_text(
        yaml.dump(plan, default_flow_style=False, allow_unicode=True),
        encoding="utf-8",
    )
    (stage_dir / "sources.json").write_text(
        json.dumps(
            {"sources": sources, "count": len(sources), "generated": _utcnow_iso()},
            indent=2,
        ),
        encoding="utf-8",
    )

    # F1.5: Extract queries from plan for Stage 4 real literature search
    queries_list: list[str] = []
    year_min = 2020
    if isinstance(plan, dict):
        strategies = plan.get("search_strategies", [])
        if isinstance(strategies, list):
            for strat in strategies:
                if isinstance(strat, dict):
                    qs = strat.get("queries", [])
                    if isinstance(qs, list):
                        queries_list.extend(str(q) for q in qs if q)
        filters = plan.get("filters", {})
        if isinstance(filters, dict) and filters.get("min_year"):
            try:
                year_min = int(filters["min_year"])
            except (ValueError, TypeError):
                pass

    # --- Sanitize queries: shorten overly long queries ---
    # LLMs often produce the full topic title as a query, which is too long for
    # arXiv and Semantic Scholar (they work best with 3-8 keyword queries).
    _stop = {
        "a", "an", "the", "of", "for", "in", "on", "and", "or", "with",
        "to", "by", "from", "its", "is", "are", "was", "be", "as", "at",
        "via", "using", "based", "study", "analysis", "empirical",
        "towards", "toward", "into", "exploring", "comparison", "tasks",
        "effectiveness", "investigation", "comprehensive", "novel",
    }

    def _extract_keywords(text: str) -> list[str]:
        """Extract meaningful keywords from text, removing stop words."""
        return [
            w for w in re.split(r"[^a-zA-Z0-9]+", text)
            if w.lower() not in _stop and len(w) > 1
        ]

    _MAX_QUERY_LEN = 60  # characters — beyond this, shorten to keywords
    _SEARCH_SUFFIXES = ["benchmark", "survey", "seminal", "state of the art"]

    def _shorten_query(q: str, max_kw: int = 6) -> str:
        """Shorten a query to *max_kw* keywords, preserving any trailing suffix."""
        q_stripped = q.strip()
        # Check if query ends with a known search suffix
        suffix = ""
        q_core = q_stripped
        for sfx in _SEARCH_SUFFIXES:
            if q_stripped.lower().endswith(sfx):
                suffix = sfx
                q_core = q_stripped[: -len(sfx)].strip()
                break
        # Extract keywords from the core part
        kws = _extract_keywords(q_core)
        shortened = " ".join(kws[:max_kw])
        if suffix:
            shortened = f"{shortened} {suffix}"
        return shortened

    # Replace over-long queries with their keyword-shortened form; drop a
    # query only if shortening leaves nothing.
    if queries_list:
        sanitized: list[str] = []
        for q in queries_list:
            if len(q) > _MAX_QUERY_LEN:
                shortened = _shorten_query(q)
                if shortened.strip():
                    sanitized.append(shortened)
            else:
                sanitized.append(q)
        queries_list = sanitized

    if not queries_list:
        # Build diverse keyword queries from the topic
        _words = _extract_keywords(topic)
        kw_primary = " ".join(_words[:6])
        kw_short = " ".join(_words[:4])
        queries_list = [
            kw_primary,
            f"{kw_short} benchmark",
            f"{kw_short} survey",
        ]

    # Ensure minimum query diversity — if dedup leaves too few, add variants
    _all_kw = _extract_keywords(topic)
    _seen_q: set[str] = set()
    unique_queries: list[str] = []
    for q in queries_list:
        q_lower = q.strip().lower()
        if q_lower and q_lower not in _seen_q:
            _seen_q.add(q_lower)
            unique_queries.append(q.strip())
    # If we have fewer than 5 unique queries, generate supplemental keyword variants
    if len(unique_queries) < 5 and len(_all_kw) >= 3:
        supplements = [
            " ".join(_all_kw[:4]) + " survey",
            " ".join(_all_kw[:4]) + " benchmark",
            " ".join(_all_kw[1:5]),  # shifted window for diversity
            " ".join(_all_kw[:3]) + " comparison",
            " ".join(_all_kw[:3]) + " deep learning",
            " ".join(_all_kw[2:6]),  # another shifted window
        ]
        for s in supplements:
            s_lower = s.strip().lower()
            if s_lower not in _seen_q:
                _seen_q.add(s_lower)
                unique_queries.append(s.strip())
            if len(unique_queries) >= 8:
                break
    queries_list = unique_queries

    # --- 6) Persist the final query list for Stage 4 ---
    (stage_dir / "queries.json").write_text(
        json.dumps({"queries": queries_list, "year_min": year_min}, indent=2),
        encoding="utf-8",
    )
    return StageResult(
        stage=Stage.SEARCH_STRATEGY,
        status=StageStatus.DONE,
        artifacts=("search_plan.yaml", "sources.json", "queries.json"),
        evidence_refs=(
            "stage-03/search_plan.yaml",
            "stage-03/sources.json",
            "stage-03/queries.json",
        ),
    )
def _execute_literature_collect(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 4: Collect literature — prefer real APIs, fallback to LLM.

    Collection order (each step is best-effort, non-fatal):
      1. Real API search (OpenAlex/S2/arXiv) over expanded Stage-3 queries.
      2. Injection of seminal papers from the local seed library.
      3. LLM-generated candidates if nothing real was found.
      4. Web-search augmentation (Tavily/DDG, Google Scholar, crawling).
      5. Placeholder candidates as the ultimate fallback.

    Writes ``candidates.jsonl``, ``search_meta.json``, and — when available —
    ``references.bib``, ``web_context.md``, and ``web_search_result.json``.

    Returns:
        StageResult for ``Stage.LITERATURE_COLLECT`` listing the artifacts
        that were actually produced.
    """
    topic = config.research.topic
    # Read queries.json from Stage 3 (F1.5 output)
    queries_text = _read_prior_artifact(run_dir, "queries.json")
    queries_data = _safe_json_loads(queries_text or "{}", {})
    queries: list[str] = queries_data.get("queries", [topic])
    year_min: int = queries_data.get("year_min", 2020)

    # --- Try real API search first ---
    candidates: list[dict[str, Any]] = []
    bibtex_entries: list[str] = []
    real_search_succeeded = False
    try:
        # Imported lazily so a broken literature backend cannot crash the
        # stage; papers_to_bibtex is imported alongside (not used below).
        from researchclaw.literature.search import (
            search_papers_multi_query,
            papers_to_bibtex,
        )
        # Expand queries for broader coverage
        expanded_queries = _expand_search_queries(queries, config.research.topic)
        logger.info(
            "[literature] Searching %d queries (expanded from %d) "
            "across OpenAlex → S2 → arXiv…",
            len(expanded_queries),
            len(queries),
        )
        papers = search_papers_multi_query(
            expanded_queries,
            limit_per_query=40,
            year_min=year_min,
            s2_api_key=config.llm.s2_api_key,
        )
        if papers:
            real_search_succeeded = True
            # Count by source
            src_counts: dict[str, int] = {}
            for p in papers:
                src_counts[p.source] = src_counts.get(p.source, 0) + 1
                d = p.to_dict()
                d["collected_at"] = _utcnow_iso()
                candidates.append(d)
                bibtex_entries.append(p.to_bibtex())
            src_str = ", ".join(f"{s}: {n}" for s, n in src_counts.items())
            logger.info(
                "[literature] Found %d papers (%s)", len(papers), src_str
            )
    except Exception:  # noqa: BLE001
        # Deliberate broad catch: any search failure falls through to the
        # LLM/placeholder fallbacks below.
        logger.warning(
            "[rate-limit] Literature search failed — falling back to LLM",
            exc_info=True,
        )

    # --- Inject foundational/seminal papers ---
    try:
        from researchclaw.data import load_seminal_papers
        seminal = load_seminal_papers(topic)
        if seminal:
            # Title-based (lower-cased) dedup against already-collected papers.
            _existing_titles = {c.get("title", "").lower() for c in candidates}
            _injected = 0
            for sp in seminal:
                if sp.get("title", "").lower() not in _existing_titles:
                    candidates.append({
                        "id": f"seminal-{sp.get('cite_key', '')}",
                        "title": sp.get("title", ""),
                        "source": "seminal_library",
                        "url": "",
                        "year": sp.get("year", 2020),
                        "abstract": f"Foundational paper on {', '.join(sp.get('keywords', [])[:3])}.",
                        "authors": [{"name": sp.get("authors", "")}],
                        "cite_key": sp.get("cite_key", ""),
                        "venue": sp.get("venue", ""),
                        "collected_at": _utcnow_iso(),
                    })
                    _injected += 1
            if _injected:
                logger.info("Stage 4: Injected %d seminal papers from seed library", _injected)
    except Exception:  # noqa: BLE001
        # Seed library is optional; skip quietly if unavailable.
        logger.debug("Seminal paper injection skipped", exc_info=True)

    # --- Fallback: LLM-generated candidates ---
    if not candidates and llm is not None:
        plan_text = _read_prior_artifact(run_dir, "search_plan.yaml") or ""
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "literature_collect")
        sp = _pm.for_stage("literature_collect", evolution_overlay=_overlay, topic=topic, plan_text=plan_text)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict) and isinstance(payload.get("candidates"), list):
            candidates = [row for row in payload["candidates"] if isinstance(row, dict)]

    # --- Web search augmentation (Tavily/DDG + Google Scholar + Crawl4AI) ---
    web_context_parts: list[str] = []
    if config.web_search.enabled:
        try:
            from researchclaw.web.agent import WebSearchAgent
            import os
            # Config value wins; otherwise read the key from the environment
            # variable named in config.
            tavily_key = config.web_search.tavily_api_key or os.environ.get(
                config.web_search.tavily_api_key_env, ""
            )
            web_agent = WebSearchAgent(
                tavily_api_key=tavily_key,
                enable_scholar=config.web_search.enable_scholar,
                enable_crawling=config.web_search.enable_crawling,
                enable_pdf=config.web_search.enable_pdf_extraction,
                max_web_results=config.web_search.max_web_results,
                max_scholar_results=config.web_search.max_scholar_results,
                max_crawl_urls=config.web_search.max_crawl_urls,
            )
            web_result = web_agent.search_and_extract(
                topic, search_queries=queries,
            )
            # Convert Google Scholar papers into candidates
            # (title set recomputed each iteration so newly appended papers
            # also participate in dedup).
            for sp in web_result.scholar_papers:
                _existing_titles = {
                    str(c.get("title", "")).lower().strip() for c in candidates
                }
                if sp.title.lower().strip() not in _existing_titles:
                    lit_paper = sp.to_literature_paper()
                    d = lit_paper.to_dict()
                    d["collected_at"] = _utcnow_iso()
                    candidates.append(d)
                    bibtex_entries.append(lit_paper.to_bibtex())
            # Save web search context for downstream stages
            web_context = web_result.to_context_string(max_length=20_000)
            if web_context.strip():
                (stage_dir / "web_context.md").write_text(
                    web_context, encoding="utf-8"
                )
                web_context_parts.append(web_context)
            # Save full web search metadata
            (stage_dir / "web_search_result.json").write_text(
                json.dumps(web_result.to_dict(), indent=2, default=str),
                encoding="utf-8",
            )
            logger.info(
                "[web-search] Added %d scholar papers, %d web results, %d crawled pages",
                len(web_result.scholar_papers),
                len(web_result.web_results),
                len(web_result.crawled_pages),
            )
        except Exception:  # noqa: BLE001
            logger.warning(
                "[web-search] Web search augmentation failed — continuing with academic APIs only",
                exc_info=True,
            )

    # --- Ultimate fallback: placeholder data ---
    # BUG-L2: Do NOT overwrite real_search_succeeded here — it was already
    # set correctly in the search block above. Overwriting would mislabel
    # LLM-hallucinated or seminal papers as "real search" results.
    if not candidates:
        logger.warning("Stage 4: All literature searches failed — using placeholder papers")
        candidates = [
            {
                "id": f"candidate-{idx + 1}",
                "title": f"[Placeholder] Study {idx + 1} on {topic}",
                "source": "arxiv" if idx % 2 == 0 else "semantic_scholar",
                "url": f"https://example.org/{_safe_filename(topic.lower())}/{idx + 1}",
                "year": 2024,
                "abstract": f"This candidate investigates {topic} and reports preliminary findings.",
                "collected_at": _utcnow_iso(),
                "is_placeholder": True,
            }
            # At least 20 placeholders, more if daily_paper_count asks for it.
            for idx in range(max(20, config.research.daily_paper_count or 20))
        ]

    # Write candidates
    out = stage_dir / "candidates.jsonl"
    _write_jsonl(out, candidates)

    # BUG-50 fix: Generate BibTeX from candidates when real search failed
    # (LLM/placeholder fallback paths don't populate bibtex_entries)
    if not bibtex_entries and candidates:
        for c in candidates:
            # Placeholders are synthetic — never cite them.
            if c.get("is_placeholder"):
                continue
            _ck = c.get("cite_key", "")
            if not _ck:
                # Derive cite_key from first author surname + year
                _authors = c.get("authors", [])
                _surname = "unknown"
                if isinstance(_authors, list) and _authors:
                    # Authors may be plain strings or {"name": ...} dicts.
                    _a0 = _authors[0] if isinstance(_authors[0], str) else (_authors[0].get("name", "") if isinstance(_authors[0], dict) else "")
                    _surname = _a0.split()[-1].lower() if _a0.strip() else "unknown"
                _yr = c.get("year", 2024)
                # Initials of the first three title words, e.g. "Deep Q Net" → "dqn".
                _title_word = "".join(
                    w[0] for w in str(c.get("title", "study")).split()[:3]
                ).lower()
                _ck = f"{_surname}{_yr}{_title_word}"
            _title = c.get("title", "Untitled")
            _year = c.get("year", 2024)
            _author_str = ""
            _raw_authors = c.get("authors", [])
            if isinstance(_raw_authors, list):
                _names = []
                for _a in _raw_authors:
                    if isinstance(_a, str):
                        _names.append(_a)
                    elif isinstance(_a, dict):
                        _names.append(_a.get("name", ""))
                _author_str = " and ".join(n for n in _names if n)
            bibtex_entries.append(
                f"@article{{{_ck},\n"
                f" title={{{_title}}},\n"
                f" author={{{_author_str or 'Unknown'}}},\n"
                f" year={{{_year}}},\n"
                f" url={{{c.get('url', '')}}},\n"
                f"}}"
            )
        logger.info(
            "Stage 4: Generated %d BibTeX entries from candidates (fallback)",
            len(bibtex_entries),
        )

    # Write references.bib (F2.4)
    artifacts = ["candidates.jsonl"]
    if web_context_parts:
        artifacts.append("web_context.md")
        if (stage_dir / "web_search_result.json").exists():
            artifacts.append("web_search_result.json")
    if bibtex_entries:
        bib_content = "\n\n".join(bibtex_entries) + "\n"
        (stage_dir / "references.bib").write_text(bib_content, encoding="utf-8")
        artifacts.append("references.bib")
        logger.info(
            "Stage 4: Wrote %d BibTeX entries to references.bib", len(bibtex_entries)
        )

    # Write search metadata so downstream stages can tell real results from
    # fallback data.
    (stage_dir / "search_meta.json").write_text(
        json.dumps(
            {
                "real_search": real_search_succeeded,
                "queries_used": queries,
                "year_min": year_min,
                "total_candidates": len(candidates),
                "bibtex_entries": len(bibtex_entries),
                "ts": _utcnow_iso(),
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    artifacts.append("search_meta.json")
    return StageResult(
        stage=Stage.LITERATURE_COLLECT,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-04/{a}" for a in artifacts),
    )
def _execute_literature_screen(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 5: Screen Stage-4 candidates down to a relevance shortlist.

    Applies a cheap keyword pre-filter, then asks the LLM to score and
    select papers; guarantees a minimum shortlist size by padding with
    template-scored or supplemental candidates.  Writes ``shortlist.jsonl``.

    Returns:
        StageResult for ``Stage.LITERATURE_SCREEN``.
    """
    candidates_text = _read_prior_artifact(run_dir, "candidates.jsonl") or ""

    # --- P1-1: keyword relevance pre-filter ---
    # Before LLM screening, drop papers whose title+abstract share no keywords
    # with the research topic. This catches cross-domain noise cheaply.
    topic_keywords = _extract_topic_keywords(
        config.research.topic, config.research.domains
    )
    filtered_rows: list[dict[str, Any]] = []
    dropped_count = 0
    for raw_line in candidates_text.strip().splitlines():
        row = _safe_json_loads(raw_line, {})
        if not isinstance(row, dict):
            # Skip malformed JSONL lines.
            continue
        title = str(row.get("title", "")).lower()
        abstract = str(row.get("abstract", "")).lower()
        text_blob = f"{title} {abstract}"
        # Number of topic keywords appearing (substring match) in the paper.
        overlap = sum(1 for kw in topic_keywords if kw in text_blob)
        # T2.2: Relaxed from ≥2 to ≥1 keyword hit — previous threshold was
        # too aggressive (94% rejection rate). Single-keyword matches are
        # still screened by the LLM in the next step.
        if overlap >= 1:
            row["keyword_overlap"] = overlap
            filtered_rows.append(row)
        else:
            dropped_count += 1
    # If pre-filter dropped everything, fall back to original (safety valve)
    if not filtered_rows:
        filtered_rows = _parse_jsonl_rows(candidates_text)
    # Rebuild candidates_text from filtered rows (this is what the LLM sees).
    candidates_text = "\n".join(
        json.dumps(r, ensure_ascii=False) for r in filtered_rows
    )
    logger.info(
        "Domain pre-filter: kept %d, dropped %d (keywords: %s)",
        len(filtered_rows),
        dropped_count,
        topic_keywords[:8],
    )

    # --- LLM screening pass ---
    shortlist: list[dict[str, Any]] = []
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "literature_screen")
        sp = _pm.for_stage(
            "literature_screen",
            evolution_overlay=_overlay,
            topic=config.research.topic,
            domains=", ".join(config.research.domains)
            if config.research.domains
            else "general",
            quality_threshold=config.research.quality_threshold,
            candidates_text=candidates_text,
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict) and isinstance(payload.get("shortlist"), list):
            shortlist = [row for row in payload["shortlist"] if isinstance(row, dict)]

    # T2.2: Ensure minimum shortlist size of 15 for adequate related work
    _MIN_SHORTLIST = 15
    if not shortlist:
        # No LLM result at all: take the first N filtered rows and tag them
        # with monotonically decreasing template scores.
        rows = (
            filtered_rows[:_MIN_SHORTLIST]
            if filtered_rows
            else _parse_jsonl_rows(candidates_text)[:_MIN_SHORTLIST]
        )
        for idx, item in enumerate(rows):
            item["relevance_score"] = round(0.75 - idx * 0.02, 3)
            item["quality_score"] = round(0.72 - idx * 0.015, 3)
            item["keep_reason"] = "Template screened entry"
            shortlist.append(item)
    elif len(shortlist) < _MIN_SHORTLIST:
        # T2.2: LLM returned too few — supplement from filtered candidates
        existing_titles = {
            str(s.get("title", "")).lower().strip() for s in shortlist
        }
        for row in filtered_rows:
            if len(shortlist) >= _MIN_SHORTLIST:
                break
            title_lower = str(row.get("title", "")).lower().strip()
            if title_lower and title_lower not in existing_titles:
                # setdefault: keep any scores the LLM/pre-filter already set.
                row.setdefault("relevance_score", 0.5)
                row.setdefault("quality_score", 0.5)
                row.setdefault("keep_reason", "Supplemented to meet minimum shortlist")
                shortlist.append(row)
                existing_titles.add(title_lower)
        logger.info(
            "Stage 5: Supplemented shortlist to %d papers (minimum: %d)",
            len(shortlist), _MIN_SHORTLIST,
        )

    out = stage_dir / "shortlist.jsonl"
    _write_jsonl(out, shortlist)
    return StageResult(
        stage=Stage.LITERATURE_SCREEN,
        status=StageStatus.DONE,
        artifacts=("shortlist.jsonl",),
        evidence_refs=("stage-05/shortlist.jsonl",),
    )
def _execute_knowledge_extract(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 6: Extract structured knowledge cards from the shortlist.

    Feeds the Stage-5 shortlist (plus any Stage-4 web context) to the LLM
    and expects a ``cards`` list back; falls back to template cards derived
    from the first six shortlist rows.  Each card is written as a markdown
    file under ``stage_dir/cards/``.

    Returns:
        StageResult for ``Stage.KNOWLEDGE_EXTRACT`` with the ``cards/``
        directory as its artifact.
    """
    shortlist = _read_prior_artifact(run_dir, "shortlist.jsonl") or ""
    # Inject web context from Stage 4 if available
    web_context = _read_prior_artifact(run_dir, "web_context.md") or ""
    if web_context:
        # Cap injected web context at 10k chars to bound prompt size.
        shortlist = shortlist + "\n\n--- Web Search Context ---\n" + web_context[:10_000]
    cards_dir = stage_dir / "cards"
    cards_dir.mkdir(parents=True, exist_ok=True)

    # --- LLM extraction (preferred) ---
    cards: list[dict[str, Any]] = []
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "knowledge_extract")
        sp = _pm.for_stage("knowledge_extract", evolution_overlay=_overlay, shortlist=shortlist)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict) and isinstance(payload.get("cards"), list):
            # Keep only well-formed (dict) card entries.
            cards = [item for item in payload["cards"] if isinstance(item, dict)]

    # --- Template fallback: build cards from the first 6 shortlist rows ---
    if not cards:
        rows = _parse_jsonl_rows(shortlist)
        for idx, paper in enumerate(rows[:6]):
            title = str(paper.get("title", f"Paper {idx + 1}"))
            cards.append(
                {
                    "card_id": f"card-{idx + 1}",
                    "title": title,
                    "problem": f"How to improve {config.research.topic}",
                    "method": "Template method summary",
                    "data": "Template dataset",
                    "metrics": "Template metric",
                    "findings": "Template key finding",
                    "limitations": "Template limitation",
                    "citation": str(paper.get("url", "")),
                    "cite_key": str(paper.get("cite_key", "")),
                }
            )

    # --- Render each card as a markdown file with fixed section headings ---
    for idx, card in enumerate(cards):
        # _safe_filename sanitizes the card id for use as a file name.
        card_id = _safe_filename(str(card.get("card_id", f"card-{idx + 1}")))
        parts = [f"# {card.get('title', card_id)}", ""]
        for key in (
            "cite_key",
            "problem",
            "method",
            "data",
            "metrics",
            "findings",
            "limitations",
            "citation",
        ):
            parts.append(f"## {key.title()}")
            parts.append(str(card.get(key, "")))
            parts.append("")
        (cards_dir / f"{card_id}.md").write_text("\n".join(parts), encoding="utf-8")
    return StageResult(
        stage=Stage.KNOWLEDGE_EXTRACT,
        status=StageStatus.DONE,
        artifacts=("cards/",),
        evidence_refs=("stage-06/cards/",),
    )
================================================
FILE: researchclaw/pipeline/stage_impls/_paper_writing.py
================================================
"""Stages 16-17: Paper outline and paper draft generation."""
from __future__ import annotations
import json
import logging
import math
import re
from pathlib import Path
from typing import Any
import yaml
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain, _is_ml_domain
from researchclaw.pipeline._helpers import (
StageResult,
_build_context_preamble,
_chat_with_prompt,
_collect_experiment_results,
_default_paper_outline,
_extract_paper_title,
_generate_framework_diagram_prompt,
_generate_neurips_checklist,
_get_evolution_overlay,
_read_best_analysis,
_read_prior_artifact,
_safe_json_loads,
_topic_constraint_block,
_utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
def _execute_paper_outline(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 16: generate the paper outline and write it to ``outline.md``.

    Builds a context preamble from the best analysis / prior decision /
    experiment data, folds in reviewer feedback when this is a multi-round
    iteration, and asks the LLM for an outline. Falls back to the template
    outline when no LLM is configured or the LLM returns an empty result.

    Args:
        stage_dir: Directory where this stage's artifacts are written.
        run_dir: Root directory of the current pipeline run.
        config: Pipeline configuration (used for the research topic).
        adapters: Adapter bundle (unused here; part of the stage interface).
        llm: Optional LLM client; ``None`` means use the template outline.
        prompts: Optional prompt manager; a fresh one is created if omitted.

    Returns:
        A ``StageResult`` marking the stage DONE with ``outline.md`` as artifact.
    """
    analysis = _read_best_analysis(run_dir)
    decision = _read_prior_artifact(run_dir, "decision.md") or ""
    preamble = _build_context_preamble(
        config,
        run_dir,
        include_analysis=True,
        include_decision=True,
        include_experiment_data=True,
    )
    # WS-5.2: Read iteration feedback if available (multi-round iteration)
    feedback = ""
    iter_ctx_path = run_dir / "iteration_context.json"
    if iter_ctx_path.exists():
        try:
            ctx = json.loads(iter_ctx_path.read_text(encoding="utf-8"))
            iteration = ctx.get("iteration", 1)
            prev_score = ctx.get("quality_score")
            reviews_excerpt = ctx.get("reviews_excerpt", "")
            # Only inject feedback from round 2 onward, and only when the
            # reviewers actually left something to address.
            if iteration > 1 and reviews_excerpt:
                feedback = (
                    f"\n\n## Iteration {iteration} Feedback\n"
                    f"Previous quality score: {prev_score}/10\n"
                    f"Reviewer feedback to address:\n{reviews_excerpt[:2000]}\n"
                    f"\nYou MUST address these reviewer concerns in this revision.\n"
                )
        except (json.JSONDecodeError, KeyError):
            pass  # best-effort: a corrupt context file must not abort the stage
    if llm is not None:
        _pm = prompts or PromptManager()
        # IMP-20: Pass academic style guide block for outline stage.
        # Fix: the original `except (KeyError, Exception)` tuple was
        # redundant — KeyError is a subclass of Exception.
        try:
            _asg = _pm.block("academic_style_guide")
        except Exception:  # noqa: BLE001 — missing prompt block is non-fatal
            _asg = ""
        _overlay = _get_evolution_overlay(run_dir, "paper_outline")
        sp = _pm.for_stage(
            "paper_outline",
            evolution_overlay=_overlay,
            preamble=preamble,
            topic_constraint=_pm.block("topic_constraint", topic=config.research.topic),
            feedback=feedback,
            analysis=analysis,
            decision=decision,
            academic_style_guide=_asg,
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        outline = resp.content
        # Reasoning models may consume all tokens on CoT — retry with more
        if not outline.strip() and sp.max_tokens:
            logger.warning("Empty outline from LLM — retrying with 2x tokens")
            resp = _chat_with_prompt(
                llm,
                sp.system,
                sp.user,
                json_mode=sp.json_mode,
                max_tokens=sp.max_tokens * 2,
            )
            outline = resp.content
        if not outline.strip():
            logger.warning("LLM returned empty outline — using default")
            outline = _default_paper_outline(config.research.topic)
    else:
        outline = _default_paper_outline(config.research.topic)
    (stage_dir / "outline.md").write_text(outline, encoding="utf-8")
    return StageResult(
        stage=Stage.PAPER_OUTLINE,
        status=StageStatus.DONE,
        artifacts=("outline.md",),
        evidence_refs=("stage-16/outline.md",),
    )
def _collect_raw_experiment_metrics(run_dir: Path) -> tuple[str, bool]:
"""Collect raw experiment metric lines from stdout for paper writing.
Returns a tuple of (formatted block, has_parsed_metrics).
``has_parsed_metrics`` is True when at least one run had a non-empty
``metrics`` dict in its JSON payload — a reliable signal of real data.
"""
metric_lines: list[str] = []
run_count = 0
has_parsed_metrics = False
for stage_subdir in sorted(run_dir.glob("stage-*/runs")):
for run_file in sorted(stage_subdir.glob("*.json")):
if run_file.name == "results.json":
continue
try:
payload = json.loads(run_file.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
continue
if not isinstance(payload, dict):
continue
# R10: Skip simulated data — only collect real experiment results
if payload.get("status") == "simulated":
continue
run_count += 1
# Extract from parsed metrics (check both 'metrics' and 'key_metrics')
metrics = payload.get("metrics", {}) or payload.get("key_metrics", {})
if isinstance(metrics, dict) and metrics:
has_parsed_metrics = True
for k, v in metrics.items():
metric_lines.append(f" {k}: {v}")
# Also extract from stdout for full detail
# BUG-23: Filter out infrastructure lines that are NOT experiment results
_INFRA_KEYS = {
"SEED_COUNT", "TIME_ESTIMATE", "TRAINING_STEPS",
"REGISTERED_CONDITIONS", "METRIC_DEF", "GPU_MEMORY",
"BATCH_SIZE", "NUM_WORKERS", "TOTAL_PARAMS",
"time_budget_sec", "max_epochs", "num_seeds",
}
stdout = payload.get("stdout", "")
if stdout:
for line in stdout.splitlines():
line = line.strip()
if ":" in line:
parts = line.rsplit(":", 1)
try:
float(parts[1].strip())
key_part = parts[0].strip().split("/")[-1] # last segment
if key_part in _INFRA_KEYS:
continue # skip infrastructure lines
metric_lines.append(f" {line}")
except (ValueError, TypeError, IndexError):
pass
# R19-4 + R23-1: Collect metrics from refinement_log.json (Stage 13).
# If refinement has richer data than Stage 12 runs/, REPLACE Stage 12 data
# to avoid confusing the paper writer with conflicting sources.
_refine_lines: list[str] = []
_refine_run_count = 0
# Scan ALL refinement logs across versions, pick by quality (primary
# metric) then richness (metric count). BUG-207: Previous logic picked
# the sandbox entry with the most metric keys regardless of whether it
# represented a regression (e.g. sandbox_after_fix with 1.29% accuracy
# winning over sandbox with 78.93% because it had 6 more keys).
_best_refine_metrics: dict[str, Any] = {}
_best_refine_stdout = ""
_best_refine_primary: float | None = None
for _rl_path in sorted(run_dir.glob("stage-13*/refinement_log.json")):
try:
_rlog = json.loads(_rl_path.read_text(encoding="utf-8"))
for _it in _rlog.get("iterations", []):
for _sbx_key in ("sandbox", "sandbox_after_fix"):
_sbx = _it.get(_sbx_key, {})
if not isinstance(_sbx, dict):
continue
_sbx_metrics = _sbx.get("metrics", {})
if not isinstance(_sbx_metrics, dict) or not _sbx_metrics:
continue
# Extract primary metric value for quality comparison
_sbx_primary: float | None = None
for _pm_key in ("primary_metric", "best_metric"):
if _pm_key in _sbx_metrics:
try:
_sbx_primary = float(_sbx_metrics[_pm_key])
except (ValueError, TypeError):
pass
break
# Prefer higher primary metric; fall back to count
_dominated = False
if _best_refine_primary is not None and _sbx_primary is not None:
if _sbx_primary > _best_refine_primary:
_dominated = True # new is better
elif _sbx_primary < _best_refine_primary * 0.5:
continue # skip: regression (>50% worse)
# Accept if quality-dominant or richer-with-no-regression
if _dominated or len(_sbx_metrics) > len(_best_refine_metrics):
_best_refine_metrics = _sbx_metrics
_best_refine_stdout = _sbx.get("stdout", "")
_best_refine_primary = _sbx_primary
except (json.JSONDecodeError, OSError):
pass
if _best_refine_metrics and len(_best_refine_metrics) > len(metric_lines) // 2:
# Refinement has richer data — REPLACE Stage 12 data to avoid conflicts
metric_lines = []
run_count = 1
for k, v in _best_refine_metrics.items():
metric_lines.append(f" {k}: {v}")
# Also extract PAIRED and metric lines from stdout
if _best_refine_stdout:
for _line in _best_refine_stdout.splitlines():
_line = _line.strip()
if _line.startswith("PAIRED:"):
metric_lines.append(f" {_line}")
elif ":" in _line:
parts = _line.rsplit(":", 1)
try:
float(parts[1].strip())
metric_lines.append(f" {_line}")
except (ValueError, TypeError, IndexError):
pass
elif _best_refine_metrics:
# Refinement has some data but not richer — append to existing
run_count += 1
for k, v in _best_refine_metrics.items():
metric_lines.append(f" {k}: {v}")
if _best_refine_stdout:
for _line in _best_refine_stdout.splitlines():
_line = _line.strip()
if _line.startswith("PAIRED:"):
metric_lines.append(f" {_line}")
if not metric_lines:
return "", has_parsed_metrics
# Deduplicate while preserving order
seen: set[str] = set()
unique: list[str] = []
for line in metric_lines:
if line not in seen:
seen.add(line)
unique.append(line)
# BUG-29: Reformat raw metric lines into human-readable condition summaries
# to prevent LLM from pasting raw path-style lines into the paper
_grouped: dict[str, list[str]] = {}
_ungrouped: list[str] = []
for line in unique[:200]:
stripped = line.strip()
# Match pattern: condition/env/step/metric: value
parts = stripped.split("/")
if len(parts) >= 3 and ":" in parts[-1]:
cond = parts[0]
detail = "/".join(parts[1:])
_grouped.setdefault(cond, []).append(f" - {detail}")
else:
_ungrouped.append(stripped)
formatted_lines: list[str] = []
if _grouped:
for cond, details in sorted(_grouped.items()):
formatted_lines.append(f"## Condition: {cond}")
formatted_lines.extend(details[:30])
if _ungrouped:
formatted_lines.extend(_ungrouped)
return (
f"\n\nACTUAL EXPERIMENT DATA (from {run_count} run(s) — use ONLY these numbers):\n"
"```\n"
+ "\n".join(formatted_lines[:200])
+ "\n```\n"
"CRITICAL: Every number in the Results table MUST come from the data above. "
"Do NOT round excessively, do NOT invent numbers, do NOT change values. "
f"The experiment ran {run_count} time(s) — state this accurately in the methodology.\n"
"NEVER paste raw metric paths (like 'condition/env/step/metric: value') "
"into the paper. Always convert to formatted LaTeX tables or inline prose.\n"
), has_parsed_metrics
def _write_paper_sections(
    *,
    llm: LLMClient,
    pm: PromptManager,
    run_dir: Path | None = None,
    preamble: str,
    topic_constraint: str,
    exp_metrics_instruction: str,
    citation_instruction: str,
    outline: str,
    model_name: str = "",
) -> str:
    """Write a conference-grade paper in 3 sequential LLM calls.

    Call 1: Title + Abstract + Introduction + Related Work
    Call 2: Method + Experiments (with full experiment data)
    Call 3: Results + Discussion + Limitations + Conclusion

    Each call receives prior sections for coherence. A failed call is
    retried once (T3.5) and replaced by an explicit placeholder section if
    it still fails, so the pipeline can continue.

    Returns:
        The combined markdown draft (no References section).
    """

    def _safe_block(name: str) -> str:
        # Prompt blocks are optional — a missing one must not abort writing.
        # Fix: deduplicates six copies of an identical try/except whose
        # `except (KeyError, Exception)` tuple was redundant (KeyError is a
        # subclass of Exception).
        try:
            return pm.block(name)
        except Exception:  # noqa: BLE001
            return ""

    # Render writing_structure block for injection
    _writing_structure = _safe_block("writing_structure")
    _overlay = _get_evolution_overlay(run_dir, "paper_draft")
    system = pm.for_stage(
        "paper_draft",
        evolution_overlay=_overlay,
        preamble=preamble,
        topic_constraint=topic_constraint,
        exp_metrics_instruction=exp_metrics_instruction,
        citation_instruction=citation_instruction,
        writing_structure=_writing_structure,
        outline=outline,
    ).system
    sections: list[str] = []
    # R4-3: Title guidelines and abstract structure
    title_guidelines = _safe_block("title_guidelines")
    abstract_structure = _safe_block("abstract_structure")
    # IMP-20/25/31/24: Academic style, narrative, anti-hedging, anti-repetition
    academic_style_guide = _safe_block("academic_style_guide")
    narrative_writing_rules = _safe_block("narrative_writing_rules")
    anti_hedging_rules = _safe_block("anti_hedging_rules")
    anti_repetition_rules = _safe_block("anti_repetition_rules")
    # --- Call 1: Title + Abstract + Introduction + Related Work ---
    call1_user = (
        f"{preamble}\n\n"
        f"{topic_constraint}"
        f"{citation_instruction}\n\n"
        f"{title_guidelines}\n\n"
        f"{academic_style_guide}\n"
        f"{narrative_writing_rules}\n"
        f"{anti_hedging_rules}\n"
        f"{anti_repetition_rules}\n\n"
        "Write the following sections of a NeurIPS/ICML-quality paper in markdown. "
        "Follow the LENGTH REQUIREMENTS strictly:\n\n"
        "1. **Title** (HARD RULE: MUST be 14 words or fewer. Create a catchy method name "
        "first, then build the title: 'MethodName: Subtitle'. If your title exceeds 14 words, "
        "it will be automatically rejected. NEVER use 'Untitled Paper'.)\n"
        f"2. **Abstract** (150-220 words — HARD LIMIT. Do NOT exceed 220 words. "
        f"Do NOT include raw metric paths or 16-digit decimals.){abstract_structure}\n"
        "3. **Introduction** (800-1000 words): real-world motivation, problem statement, "
        "research gap analysis with citations, method overview, 3-4 contributions as bullet points, "
        "paper organization paragraph. MUST cite 8-12 references.\n"
        "4. **Related Work** (600-800 words): organized into 3-4 thematic subsections, each discussing "
        "4-5 papers with proper citations. Compare approaches, identify limitations, position this work.\n\n"
        f"Outline:\n{outline}\n\n"
        "Output markdown with ## headers. Do NOT include a References section.\n"
        "IMPORTANT: Start DIRECTLY with '## Title'. Do NOT include any preamble, "
        "data verification, condition listing, or metric enumeration before the title. "
        "The paper should read like a published manuscript, not a data report."
    )
    # R14-1: Higher token limit for reasoning models
    _paper_max_tokens = 12000
    if any(model_name.startswith(p) for p in ("gpt-5", "o3", "o4")):
        _paper_max_tokens = 24000
    # T3.5: Retry once on failure, use placeholder if still fails
    try:
        resp1 = _chat_with_prompt(llm, system, call1_user, max_tokens=_paper_max_tokens, retries=1)
        part1 = resp1.content.strip()
    except Exception:  # noqa: BLE001
        logger.error("Stage 17: Part 1 LLM call failed after retry — using placeholder")
        part1 = (
            "## Title\n[PLACEHOLDER — LLM call failed]\n\n"
            "## Abstract\n[This section could not be generated due to an LLM error. "
            "Please regenerate this stage.]\n\n"
            "## Introduction\n[PLACEHOLDER]\n\n"
            "## Related Work\n[PLACEHOLDER]"
        )
    sections.append(part1)
    logger.info("Stage 17: Part 1 (Title+Abstract+Intro+Related Work) — %d chars", len(part1))
    # --- Call 2: Method + Experiments ---
    call2_user = (
        f"{preamble}\n\n"
        f"{topic_constraint}"
        f"{exp_metrics_instruction}\n\n"
        f"{narrative_writing_rules}\n"
        f"{anti_hedging_rules}\n\n"
        # IMP-21: Citation instruction for Method + Experiments
        "CITATION REQUIREMENT: The Method section MUST cite at least 3-5 related "
        "technical papers (foundations your method builds on). The Experiments section "
        "MUST cite baseline method papers. Use [cite_key] syntax.\n"
        f"{citation_instruction}\n\n"
        "You are continuing a paper. The sections written so far are:\n\n"
        f"---\n{part1}\n---\n\n"
        "Now write the next sections, maintaining consistency with the above:\n\n"
        "5. **Method** (1000-1500 words): formal problem definition with mathematical notation "
        "($x$, $\\theta$, etc.), detailed algorithm description with equations, step-by-step procedure, "
        "complexity analysis, design rationale for key choices. Include algorithm pseudocode if applicable. "
        "Write as FLOWING PROSE — do NOT use bullet-point lists for method components.\n"
        "6. **Experiments** (800-1200 words): detailed experimental setup, datasets with statistics "
        "(size, splits, features), all baselines and their implementations, hyperparameter settings "
        "in a markdown table, evaluation metrics with mathematical definitions, hardware and runtime info.\n"
        "METHOD NAMES IN TABLES: Use SHORT abbreviations (4-8 chars) for method names "
        "in tables. Define abbreviation mappings in a footnote. "
        "NEVER put method names longer than 20 characters in table cells.\n\n"
        f"Outline:\n{outline}\n\n"
        "Output markdown with ## headers. Continue from where Part 1 ended."
    )
    try:
        resp2 = _chat_with_prompt(llm, system, call2_user, max_tokens=_paper_max_tokens, retries=1)
        part2 = resp2.content.strip()
    except Exception:  # noqa: BLE001
        logger.error("Stage 17: Part 2 LLM call failed after retry — using placeholder")
        part2 = (
            "## Method\n[PLACEHOLDER — LLM call failed. Please regenerate this stage.]\n\n"
            "## Experiments\n[PLACEHOLDER]"
        )
    sections.append(part2)
    logger.info("Stage 17: Part 2 (Method+Experiments) — %d chars", len(part2))
    # --- Call 3: Results + Discussion + Limitations + Conclusion ---
    call3_user = (
        f"{preamble}\n\n"
        f"{topic_constraint}"
        f"{exp_metrics_instruction}\n\n"
        f"{narrative_writing_rules}\n"
        f"{anti_hedging_rules}\n"
        f"{anti_repetition_rules}\n\n"
        # IMP-21: Citation instruction for Results + Discussion + Conclusion
        "CITATION REQUIREMENT: The Discussion section MUST cite at least 3-5 papers "
        "when comparing findings with prior work. The Conclusion may cite 1-2 "
        "foundational references.\n"
        f"{citation_instruction}\n\n"
        "You are completing a paper. The sections written so far are:\n\n"
        f"---\n{part1}\n\n{part2}\n---\n\n"
        "Now write the final sections, maintaining consistency:\n\n"
        "7. **Results** (600-800 words):\n"
        " - START with an AGGREGATED results table (Table 1): rows = methods, columns = metrics.\n"
        " Each cell = mean \u00b1 std across seeds. Bold the best value per column.\n"
        " EVERY table MUST have a descriptive caption that allows understanding without "
        " reading the main text. NEVER use just 'Table 1' as a caption.\n"
        " - Follow with a PER-REGIME table (Table 2) breaking down by easy/hard regimes.\n"
        " - Include a STATISTICAL COMPARISON table (Table 3): paired t-tests between key methods.\n"
        " - NEVER dump raw per-seed numbers in the main text. Aggregate first, then discuss.\n"
        " - MUST include at least 2 figures using markdown image syntax: \n"
        " One figure MUST be a performance comparison chart. Figures MUST be referenced "
        " in text: 'As shown in Figure 1, ...'\n"
        "8. **Discussion** (400-600 words): interpretation of key findings, unexpected results, "
        "comparison with prior work (CITE 3-5 papers here!), practical implications.\n"
        "9. **Limitations** (200-300 words): honest assessment of scope, dataset, methodology. "
        "ALL caveats consolidated HERE — nowhere else in the paper.\n"
        "10. **Conclusion** (100-200 words MAXIMUM — this is a HARD LIMIT): "
        "Summarize contributions in 2-3 sentences. State main finding in 1 sentence. "
        "Suggest 2-3 concrete future directions in 1-2 sentences. "
        "Do NOT repeat any specific numbers from Results. Do NOT restate the abstract. "
        "A good conclusion is SHORT and forward-looking.\n\n"
        "CRITICAL FORMATTING RULES FOR ALL SECTIONS:\n"
        "- Write as FLOWING PROSE paragraphs, NOT bullet-point lists\n"
        "- NEVER dump raw metric paths like 'config/method_name/seed_3/primary_metric'\n"
        "- All numbers must be rounded to 4 decimal places maximum\n"
        "- Every table MUST have a descriptive caption (not just 'Table 1')\n"
        "- Use \\begin{algorithm} or pseudocode notation, NOT \\begin{verbatim}\n\n"
        "Output markdown with ## headers. Do NOT include a References section."
    )
    try:
        resp3 = _chat_with_prompt(llm, system, call3_user, max_tokens=_paper_max_tokens, retries=1)
        part3 = resp3.content.strip()
    except Exception:  # noqa: BLE001
        logger.error("Stage 17: Part 3 LLM call failed after retry — using placeholder")
        part3 = (
            "## Results\n[PLACEHOLDER — LLM call failed. Please regenerate this stage.]\n\n"
            "## Discussion\n[PLACEHOLDER]\n\n"
            "## Limitations\n[PLACEHOLDER]\n\n"
            "## Conclusion\n[PLACEHOLDER]"
        )
    sections.append(part3)
    logger.info("Stage 17: Part 3 (Results+Discussion+Limitations+Conclusion) — %d chars", len(part3))
    # Combine all sections
    draft = "\n\n".join(sections)
    # R32: Strip data verification preamble that LLMs sometimes emit before
    # the actual paper. The preamble typically starts with "## Tested Conditions"
    # or similar headings and ends before "## Title".
    # Fix: use the module-level `re` instead of a redundant local
    # `import re as _re_strip` (re is already imported at module scope).
    _title_match = re.search(r"^## Title\b", draft, re.MULTILINE)
    if _title_match and _title_match.start() > 200:
        _stripped = draft[_title_match.start():]
        logger.info(
            "R32: Stripped %d-char preamble before '## Title'",
            _title_match.start(),
        )
        draft = _stripped
    total_words = len(draft.split())
    logger.info("Stage 17: Full draft — %d chars, ~%d words", len(draft), total_words)
    return draft
# ---------------------------------------------------------------------------
# Draft quality validation (section balance + bullet-point density)
# ---------------------------------------------------------------------------
# Sections where bullets/numbered lists are acceptable. Headings are matched
# lowercased; _validate_draft_quality() applies a laxer 50% bullet-density
# threshold to these sections instead of the default 25%.
_BULLET_LENIENT_SECTIONS = frozenset({
    "introduction", "limitations", "limitation",
    "limitations and future work", "abstract",
})
# Main body sections used for balance ratio check: _validate_draft_quality()
# warns when the largest-to-smallest word-count ratio among these exceeds 3x.
_BALANCE_SECTIONS = frozenset({
    "introduction", "related work", "method", "experiments", "results",
    "discussion",
})
def _validate_draft_quality(
    draft: str,
    stage_dir: Path | None = None,
) -> dict[str, Any]:
    """Validate a paper draft for section balance and prose quality.

    Checks:
      1. Per-section word count vs ``SECTION_WORD_TARGETS``.
      2. Bullet-point / numbered-list density per section.
      3. Largest-to-smallest main-section word-count ratio.
      4. Citation count/recency, Abstract/Conclusion length, raw metric
         paths, weasel words, duplicate words, AI boilerplate, Related Work
         depth, and statistical rigor in results sections.

    Fix: the citation-recency warning previously ended with a literal
    ``>=30%%`` — ``%%`` is only an escape in %-formatting, not in f-strings.

    Returns a dict with ``section_analysis``, ``overall_warnings``, and
    ``revision_directives``. Optionally writes ``draft_quality.json`` to
    *stage_dir*.
    """
    from researchclaw.prompts import SECTION_WORD_TARGETS, _SECTION_TARGET_ALIASES
    _heading_re = re.compile(r"^(#{1,4})\s+(.+)$", re.MULTILINE)
    matches = list(_heading_re.finditer(draft))
    # Split the draft into (heading, level, body) records, one per heading.
    sections_data: list[dict[str, Any]] = []
    for i, m in enumerate(matches):
        level = len(m.group(1))
        heading = m.group(2).strip()
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(draft)
        body = draft[start:end].strip()
        sections_data.append({
            "heading": heading,
            "heading_lower": heading.strip().lower(),
            "level": level,
            "body": body,
        })
    section_analysis: list[dict[str, Any]] = []
    overall_warnings: list[str] = []
    revision_directives: list[str] = []
    main_section_words: dict[str, int] = {}
    _bullet_re = re.compile(r"^\s*[-*]\s+", re.MULTILINE)
    _numbered_re = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)
    # BUG-24: Accumulate subsection (H3+) word counts into parent H2 sections
    _subsection_words: dict[str, int] = {}
    _current_parent = ""
    for sec in sections_data:
        if sec["level"] <= 2:
            _current_parent = sec["heading_lower"]
            _subsection_words.setdefault(_current_parent, 0)
        else:
            # Add subsection words to parent
            _subsection_words[_current_parent] = (
                _subsection_words.get(_current_parent, 0) + len(sec["body"].split())
            )
    for sec in sections_data:
        if sec["level"] > 2:
            continue
        heading_lower: str = sec["heading_lower"]
        body: str = sec["body"]
        # BUG-24: Include subsection words in the parent's word count
        word_count = len(body.split()) + _subsection_words.get(heading_lower, 0)
        canon = heading_lower
        if canon not in SECTION_WORD_TARGETS:
            canon = _SECTION_TARGET_ALIASES.get(heading_lower, "")
        entry: dict[str, Any] = {
            "heading": sec["heading"],
            "word_count": word_count,
            "canonical": canon,
        }
        if canon and canon in SECTION_WORD_TARGETS:
            lo, hi = SECTION_WORD_TARGETS[canon]
            entry["target"] = [lo, hi]
            if word_count < int(lo * 0.7):
                overall_warnings.append(
                    f"{sec['heading']} is severely under target "
                    f"({word_count} words, target {lo}-{hi})"
                )
                revision_directives.append(
                    f"EXPAND {sec['heading']} from {word_count} to {lo}+ words. "
                    f"Add substantive content \u2014 do NOT pad with filler."
                )
                entry["status"] = "severely_short"
            elif word_count < lo:
                overall_warnings.append(
                    f"{sec['heading']} is under target "
                    f"({word_count} words, target {lo}-{hi})"
                )
                revision_directives.append(
                    f"Expand {sec['heading']} from {word_count} to {lo}+ words."
                )
                entry["status"] = "short"
            elif word_count > int(hi * 1.3):
                overall_warnings.append(
                    f"{sec['heading']} exceeds target "
                    f"({word_count} words, target {lo}-{hi})"
                )
                revision_directives.append(
                    f"Compress {sec['heading']} from {word_count} to {hi} words or fewer."
                )
                entry["status"] = "long"
            else:
                entry["status"] = "ok"
        if body:
            # Bullet/numbered density; lenient sections tolerate up to 50%.
            total_lines = len([ln for ln in body.splitlines() if ln.strip()])
            bullet_lines = len(_bullet_re.findall(body)) + len(_numbered_re.findall(body))
            density = bullet_lines / total_lines if total_lines > 0 else 0.0
            entry["bullet_density"] = round(density, 2)
            threshold = 0.50 if heading_lower in _BULLET_LENIENT_SECTIONS else 0.25
            if density > threshold and total_lines >= 4:
                overall_warnings.append(
                    f"{sec['heading']} has {bullet_lines}/{total_lines} "
                    f"bullet/numbered lines ({density:.0%} density, "
                    f"threshold {threshold:.0%})"
                )
                revision_directives.append(
                    f"REWRITE {sec['heading']} as flowing academic prose. "
                    f"Convert bullet points to narrative paragraphs."
                )
                entry["bullet_status"] = "high"
            else:
                entry["bullet_status"] = "ok"
        canon_balance = canon or heading_lower
        if canon_balance in _BALANCE_SECTIONS:
            main_section_words[canon_balance] = word_count
        section_analysis.append(entry)
    if len(main_section_words) >= 2:
        wc_values = list(main_section_words.values())
        max_wc = max(wc_values)
        min_wc = min(wc_values)
        if min_wc > 0 and max_wc / min_wc > 3.0:
            largest = max(main_section_words, key=main_section_words.get)  # type: ignore[arg-type]
            smallest = min(main_section_words, key=main_section_words.get)  # type: ignore[arg-type]
            overall_warnings.append(
                f"Section imbalance: {largest} ({max_wc} words) vs "
                f"{smallest} ({min_wc} words) \u2014 ratio {max_wc / min_wc:.1f}x"
            )
            revision_directives.append(
                f"Rebalance sections: expand {smallest} and/or compress {largest} "
                f"to achieve more even section lengths."
            )
    # --- C-4/C-5: Citation count and recency checks ---
    _cite_pattern = re.compile(r"\[([a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9]*)\]")
    cited_keys = set(_cite_pattern.findall(draft))
    if cited_keys:
        n_citations = len(cited_keys)
        if n_citations < 15:
            overall_warnings.append(
                f"Only {n_citations} unique citations found (target: >=15 for a full paper)"
            )
            revision_directives.append(
                f"Add more references — a top-venue paper typically cites 25-40 works. "
                f"Currently only {n_citations} unique citations."
            )
        # Check recency: count citations with year >= current_year - 2
        _year_pat = re.compile(r"(\d{4})")
        import datetime as _dt_cit
        _cur_year = _dt_cit.datetime.now().year
        recent_count = sum(
            1 for k in cited_keys
            for m in [_year_pat.search(k)]
            if m and int(m.group(1)) >= _cur_year - 2
        )
        recency_ratio = recent_count / n_citations if n_citations > 0 else 0.0
        if recency_ratio < 0.3 and n_citations >= 10:
            # Fix: was ">=30%%" — f-strings do not treat % specially, so the
            # doubled percent appeared verbatim in the warning text.
            overall_warnings.append(
                f"Citation recency low: only {recent_count}/{n_citations} "
                f"({recency_ratio:.0%}) from last 3 years (target: >=30%)"
            )
    # --- Abstract and Conclusion length enforcement ---
    for sec in sections_data:
        hl = sec["heading_lower"]
        body_text: str = sec["body"]
        wc = len(body_text.split())
        if hl == "abstract" and wc > 250:
            overall_warnings.append(
                f"Abstract is too long: {wc} words (target: 150-220 words)"
            )
            revision_directives.append(
                f"COMPRESS the Abstract from {wc} to 150-220 words. "
                f"Remove raw metric values, redundant context, and self-references."
            )
        if hl in ("conclusion", "conclusions", "conclusion and future work"):
            if wc > 300:
                overall_warnings.append(
                    f"Conclusion is too long: {wc} words (target: 100-200 words)"
                )
                revision_directives.append(
                    f"COMPRESS the Conclusion from {wc} to 100-200 words. "
                    f"Do NOT repeat specific metric values from Results. "
                    f"Summarize findings in 2-3 sentences, then 2-3 future directions."
                )
    # --- Raw metric path detection (log dumps in prose) ---
    _raw_path_re = re.compile(
        r"\\texttt\{[a-zA-Z0-9_/.-]+(?:/[a-zA-Z0-9_/.-]+){2,}",
    )
    raw_path_count = len(_raw_path_re.findall(draft))
    if raw_path_count > 3:
        overall_warnings.append(
            f"Raw metric paths in prose: {raw_path_count} instances of "
            f"\\texttt{{config/path/metric}} style dumps"
        )
        revision_directives.append(
            "REMOVE raw experiment log paths from prose. Replace "
            "\\texttt{config/metric/path} with human-readable metric names "
            "and summarize values in tables, not inline text."
        )
    # --- Writing quality lint ---
    _weasel_words = re.compile(
        r"\b(various|many|several|quite|fairly|really|very|rather|"
        r"somewhat|relatively|arguably|interestingly|importantly|"
        r"it is well known that|it is obvious that|clearly)\b",
        re.IGNORECASE,
    )
    _duplicate_words = re.compile(r"\b(\w+)\s+\1\b", re.IGNORECASE)
    weasel_count = len(_weasel_words.findall(draft))
    dup_matches = _duplicate_words.findall(draft)
    # "that that" / "had had" are grammatical English — do not flag them.
    dup_count = len([d for d in dup_matches if d.lower() not in ("that", "had")])
    if weasel_count > 20:
        overall_warnings.append(
            f"High weasel-word count: {weasel_count} instances "
            f"(consider replacing vague words with precise language)"
        )
        revision_directives.append(
            "Replace vague hedging words (various, several, quite, fairly, "
            "rather, somewhat) with precise quantities or remove them."
        )
    if dup_count > 0:
        overall_warnings.append(
            f"Duplicate adjacent words found: {dup_count} instance(s) "
            f"(e.g., 'the the', 'is is')"
        )
        revision_directives.append(
            "Fix duplicate adjacent words (likely typos)."
        )
    # --- AI-slop / boilerplate detection ---
    _BOILERPLATE_PHRASES = [
        "delves into", "delve into", "it is worth noting",
        "it should be noted", "it is important to note",
        "leverage the power of", "leverages the power of",
        "in this paper, we propose", "in this work, we propose",
        "to the best of our knowledge",
        "in the realm of", "in the landscape of",
        "plays a crucial role", "plays a pivotal role",
        "groundbreaking", "cutting-edge", "state-of-the-art",
        "game-changing", "paradigm shift",
        "a myriad of", "a plethora of",
        "aims to bridge the gap", "bridge the gap",
        "shed light on", "sheds light on",
        "pave the way", "paves the way",
        "the advent of", "with the advent of",
        "in recent years", "in recent times",
        "has gained significant attention",
        "has attracted considerable interest",
        "has emerged as a promising",
        "a comprehensive overview",
        "a holistic approach", "holistic understanding",
        "showcasing the efficacy", "demonstrate the efficacy",
        "multifaceted", "underscores the importance",
        "navigate the complexities",
        "harness the potential", "harnessing the power",
        "it is imperative to", "it is crucial to",
        "a nuanced understanding", "nuanced approach",
        "robust and scalable", "seamlessly integrates",
        "the intricacies of", "intricate interplay",
        "facilitate a deeper understanding",
        "a testament to",
    ]
    draft_lower = draft.lower()
    boilerplate_hits: list[str] = []
    for phrase in _BOILERPLATE_PHRASES:
        count = draft_lower.count(phrase)
        if count > 0:
            boilerplate_hits.extend([phrase] * count)
    if len(boilerplate_hits) > 5:
        unique_phrases = sorted(set(boilerplate_hits))[:5]
        overall_warnings.append(
            f"AI boilerplate detected: {len(boilerplate_hits)} instances "
            f"of generic LLM phrases (e.g., {', '.join(repr(p) for p in unique_phrases[:3])})"
        )
        revision_directives.append(
            "REWRITE sentences containing AI-generated boilerplate phrases. "
            "Replace generic language (e.g., 'delves into', 'it is worth noting', "
            "'leverages the power of', 'plays a crucial role', 'paves the way') "
            "with precise, specific academic language."
        )
    # --- Related work depth check ---
    _rw_headings = {"related work", "related works", "background", "literature review"}
    rw_body = ""
    for sec in sections_data:
        if sec["heading_lower"] in _rw_headings and sec["level"] <= 2:
            rw_body = sec["body"]
            break
    if rw_body and len(rw_body.split()) > 50:
        _comparative_pats = re.compile(
            r"\b(unlike|in contrast|whereas|while .+ focus|"
            r"however|differ(?:s|ent)|our (?:method|approach) .+ instead|"
            r"we (?:instead|differ)|compared to|as opposed to|"
            r"goes beyond|extends|improves upon|addresses the limitation)\b",
            re.IGNORECASE,
        )
        sentences = [s.strip() for s in re.split(r"[.!?]+", rw_body) if s.strip()]
        comparative_sents = sum(1 for s in sentences if _comparative_pats.search(s))
        ratio = comparative_sents / len(sentences) if sentences else 0.0
        if ratio < 0.15 and len(sentences) >= 5:
            overall_warnings.append(
                f"Related Work is purely descriptive: only {comparative_sents}/{len(sentences)} "
                f"sentences ({ratio:.0%}) contain comparative language (target: >=15%)"
            )
            revision_directives.append(
                "REWRITE Related Work to critically compare with prior methods. "
                "Use phrases like 'unlike X, our approach...', 'in contrast to...', "
                "'while X focuses on... we address...' for at least 20% of sentences."
            )
    # --- Statistical rigor check (result sections) ---
    _results_headings = {"results", "experiments", "experimental results", "evaluation"}
    results_body = ""
    for sec in sections_data:
        if sec["heading_lower"] in _results_headings and sec["level"] <= 2:
            results_body += sec["body"] + "\n"
    if results_body and len(results_body.split()) > 100:
        has_std = bool(re.search(r"\u00b1|\\pm|\bstd\b|\\std\b|standard deviation", results_body, re.IGNORECASE))
        has_ci = bool(re.search(r"confidence interval|\bCI\b|95%|p-value|p\s*<", results_body, re.IGNORECASE))
        has_seeds = bool(re.search(r"(?:seed|run|trial)s?\s*[:=]\s*\d|averaged?\s+over\s+\d+\s+(?:seed|run|trial)", results_body, re.IGNORECASE))
        if not has_std and not has_ci and not has_seeds:
            overall_warnings.append(
                "No statistical measures found in results (no std, CI, p-values, or multi-seed reporting)"
            )
            revision_directives.append(
                "ADD error bars (\u00b1std), confidence intervals, or note the number of "
                "random seeds used. Single-run results without variance reporting "
                "are insufficient for top venues."
            )
    result: dict[str, Any] = {
        "section_analysis": section_analysis,
        "overall_warnings": overall_warnings,
        "revision_directives": revision_directives,
    }
    if stage_dir is not None:
        (stage_dir / "draft_quality.json").write_text(
            json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8"
        )
    if overall_warnings:
        logger.warning(
            "Draft quality: %d warning(s) \u2014 %s",
            len(overall_warnings),
            "; ".join(overall_warnings[:3]),
        )
    else:
        logger.info("Draft quality: all checks passed")
    return result
def _review_compiled_pdf(
    pdf_path: Path,
    llm: LLMClient,
    topic: str,
) -> dict[str, Any]:
    """Multi-dimensional LLM review of compiled paper (AI-Scientist style).

    Scores the paper on 7 academic review dimensions (1-10 each),
    identifies specific strengths/weaknesses, and provides an overall
    accept/reject recommendation with confidence.

    Parameters
    ----------
    pdf_path:
        Path to the compiled PDF. The review is actually performed on the
        sibling ``.tex`` file (same stem), not the PDF bytes — see note below.
    llm:
        Chat-capable LLM client used to generate the review.
    topic:
        Research topic string injected verbatim into the review prompt.

    Returns
    -------
    dict
        Dimensional scores, issues, decision, and a computed ``mean_score``
        (unweighted mean of the valid dimension scores). Returns an empty
        dict when the PDF or TeX file is missing, when the LLM response
        does not parse into a dict containing ``overall_score``, or when
        the LLM call raises.
    """
    if not pdf_path.exists():
        return {}
    # Use source-based review since not all models support vision
    tex_path = pdf_path.with_suffix(".tex")
    if not tex_path.exists():
        return {}
    # Truncate the LaTeX source to bound prompt size; 12k chars keeps the
    # preamble, abstract, and most of the body for a typical paper.
    tex_content = tex_path.read_text(encoding="utf-8")[:12000]
    review_prompt = (
        "You are a senior Area Chair at a top AI conference (NeurIPS/ICML/ICLR) "
        "reviewing a paper submission. Provide a rigorous, structured review.\n\n"
        f"PAPER TOPIC: {topic}\n\n"
        f"LaTeX source:\n```latex\n{tex_content}\n```\n\n"
        "REVIEW INSTRUCTIONS:\n"
        "Score each dimension 1-10 (1=unacceptable, 5=borderline, 8=strong accept, "
        "10=best paper candidate). Be critical but fair.\n\n"
        "DIMENSIONS:\n"
        "1. SOUNDNESS: Are claims well-supported? Is methodology correct? "
        "Are there logical gaps or unsupported claims?\n"
        "2. PRESENTATION: Is the writing clear, flowing, and professional? "
        "Are there grammar errors, bullet lists in prose sections, or "
        "boilerplate phrases? Is it free of AI-generated slop?\n"
        "3. CONTRIBUTION: Is the contribution significant? Does it advance "
        "the field beyond incremental improvement?\n"
        "4. ORIGINALITY: Is the approach novel? Does it differentiate clearly "
        "from prior work?\n"
        "5. CLARITY: Are the method and results easy to understand? Are figures "
        "and tables well-designed with descriptive captions?\n"
        "6. SIGNIFICANCE: Would the community benefit from this work? Does it "
        "open new research directions?\n"
        "7. REPRODUCIBILITY: Are experimental details sufficient to reproduce "
        "results? Are hyperparameters, datasets, and metrics clearly stated?\n\n"
        "Also evaluate:\n"
        "- Are all figures referenced in the text?\n"
        "- Are tables properly formatted (booktabs style, no vertical rules)?\n"
        "- Does the related work critically compare, not just list papers?\n"
        "- Are statistical measures (std, CI, multiple seeds) reported?\n"
        "- Is there a clear limitations section?\n\n"
        "Return a JSON object:\n"
        "{\n"
        '  "soundness": N,\n'
        '  "presentation": N,\n'
        '  "contribution": N,\n'
        '  "originality": N,\n'
        '  "clarity": N,\n'
        '  "significance": N,\n'
        '  "reproducibility": N,\n'
        '  "overall_score": N,\n'
        '  "confidence": N,\n'
        '  "decision": "accept" or "reject",\n'
        '  "strengths": ["strength1", "strength2", ...],\n'
        '  "weaknesses": ["weakness1", "weakness2", ...],\n'
        '  "critical_issues": ["issue requiring revision", ...],\n'
        '  "minor_issues": ["formatting/typo issues", ...],\n'
        '  "summary": "2-3 sentence overall assessment"\n'
        "}\n"
    )
    try:
        resp = llm.chat(
            messages=[{"role": "user", "content": review_prompt}],
            system=(
                "You are a meticulous, critical academic reviewer. "
                "You have reviewed 100+ papers at top venues. "
                "Score honestly — most papers deserve 4-6, not 7-9. "
                "Flag any sign of AI-generated boilerplate."
            ),
        )
        # _safe_json_loads returns the fallback ({}) on malformed output,
        # so a non-JSON reply silently yields an empty review dict.
        review_data = _safe_json_loads(resp.content, {})
        if isinstance(review_data, dict) and "overall_score" in review_data:
            # Compute the unweighted mean over whichever of the seven
            # dimension scores the model actually returned.
            dim_scores = {
                k: review_data.get(k, 0)
                for k in (
                    "soundness", "presentation", "contribution",
                    "originality", "clarity", "significance",
                    "reproducibility",
                )
            }
            # Only positive numeric scores count; missing/zero/garbage
            # dimensions are excluded from the mean rather than dragging it down.
            valid = {k: v for k, v in dim_scores.items() if isinstance(v, (int, float)) and v > 0}
            if valid:
                review_data["mean_score"] = round(sum(valid.values()) / len(valid), 2)
            return review_data
    except Exception as exc:  # noqa: BLE001
        # Best-effort review: any failure (network, provider, parsing) is
        # logged at debug level and the caller receives an empty dict.
        logger.debug("PDF review LLM call failed: %s", exc)
    return {}
def _check_ablation_effectiveness(
exp_summary: dict[str, Any],
threshold: float = 0.02,
) -> list[str]:
"""P7: Check if ablation results are within *threshold* of baseline.
Returns a list of warning strings for ineffective ablations.
Threshold tightened from 5% to 2% (Improvement C) — ablations with
< 2% relative difference AND < 1pp absolute difference are flagged
as TRIVIAL.
"""
warnings: list[str] = []
cond_summaries = exp_summary.get("condition_summaries", {})
if not isinstance(cond_summaries, dict) or not cond_summaries:
return warnings
# Find baseline/control condition
baseline_name = None
baseline_mean = None
for name, data in cond_summaries.items():
if not isinstance(data, dict):
continue
name_lower = name.lower()
if any(tag in name_lower for tag in ("baseline", "control", "vanilla", "standard")):
metrics = data.get("metrics") or {}
if not isinstance(metrics, dict):
metrics = {}
# Use the first metric that has a _mean suffix or the first available
for mk, mv in metrics.items():
if mk.endswith("_mean"):
baseline_name = name
baseline_mean = float(mv)
break
if baseline_mean is None:
for mk, mv in metrics.items():
try:
baseline_name = name
baseline_mean = float(mv)
break
except (TypeError, ValueError):
continue
if baseline_name:
break
if baseline_name is None or baseline_mean is None:
return warnings
# Check each ablation condition
for name, data in cond_summaries.items():
if not isinstance(data, dict):
continue
name_lower = name.lower()
if name == baseline_name:
continue
if not any(tag in name_lower for tag in ("ablation", "no_", "without", "reduced")):
continue
metrics = data.get("metrics") or {}
if not isinstance(metrics, dict):
metrics = {}
for mk, mv in metrics.items():
if not mk.endswith("_mean"):
continue
try:
abl_val = float(mv)
except (TypeError, ValueError):
continue
if baseline_mean != 0:
rel_diff = abs(abl_val - baseline_mean) / abs(baseline_mean)
else:
rel_diff = abs(abl_val - baseline_mean)
abs_diff = abs(abl_val - baseline_mean)
# Improvement C: Tighter check — both relative < threshold
# AND absolute < 1pp → TRIVIAL
if rel_diff < threshold and abs_diff < 1.0:
warnings.append(
f"TRIVIAL: Ablation '{name}' {mk}={abl_val:.4f} is within "
f"{rel_diff:.1%} (abs {abs_diff:.4f}pp) of baseline "
f"'{baseline_name}' {mk}={baseline_mean:.4f} — "
f"ablation is ineffective"
)
elif rel_diff < threshold:
warnings.append(
f"Ablation '{name}' {mk}={abl_val:.4f} is within "
f"{rel_diff:.1%} of baseline '{baseline_name}' "
f"{mk}={baseline_mean:.4f} — ablation may be ineffective"
)
break # Only check the first _mean metric per condition
# Improvement C: Prepend CRITICAL summary if >50% trivial
trivial_count = sum(1 for w in warnings if w.startswith("TRIVIAL:"))
if trivial_count > 0 and len(warnings) > 0 and trivial_count / len(warnings) > 0.5:
warnings.insert(0, (
f"CRITICAL: {trivial_count}/{len(warnings)} ablations are trivially "
f"similar to baseline (<{threshold:.0%} relative, <1pp absolute). "
f"The ablation design is likely broken — components are not effectively removed."
))
return warnings
def _detect_result_contradictions(
exp_summary: dict[str, Any],
metric_direction: str = "maximize",
) -> list[str]:
"""P10: Detect contradictions in experiment results before paper writing.
Returns a list of advisory strings to inject into paper writing prompt.
"""
advisories: list[str] = []
cond_summaries = exp_summary.get("condition_summaries", {})
if not isinstance(cond_summaries, dict) or not cond_summaries:
return advisories
# Collect primary metric means per condition
means: dict[str, float] = {}
for name, data in cond_summaries.items():
if not isinstance(data, dict):
continue
metrics = data.get("metrics", {})
for mk, mv in metrics.items():
if mk.endswith("_mean"):
try:
means[name] = float(mv)
except (TypeError, ValueError):
pass
break
if len(means) < 2:
return advisories
# Check 1: All methods within noise margin (2% relative spread)
vals = list(means.values())
val_range = max(vals) - min(vals)
val_mean = sum(vals) / len(vals)
if val_mean != 0 and (val_range / abs(val_mean)) < 0.02:
advisories.append(
"NULL RESULT: All methods produce nearly identical primary metric values "
f"(range={val_range:.4f}, mean={val_mean:.4f}). Frame this as a null result — "
"the methods are statistically indistinguishable. Do NOT claim any method "
"is superior. Discuss possible explanations (task too easy/hard, metric "
"insensitive, insufficient differentiation in methods)."
)
# Check 2: Control/simple baseline outperforms proposed method
# BUG-P1: Respect metric_direction — "higher is better" vs "lower is better"
_maximize = metric_direction == "maximize"
baseline_val = None
baseline_name = None
proposed_val = None
proposed_name = None
for name, val in means.items():
name_lower = name.lower()
if any(tag in name_lower for tag in ("baseline", "control", "random", "vanilla")):
if baseline_val is None or (_maximize and val > baseline_val) or (not _maximize and val < baseline_val):
baseline_val = val
baseline_name = name
elif any(tag in name_lower for tag in ("proposed", "our", "novel", "method")):
if proposed_val is None or (_maximize and val > proposed_val) or (not _maximize and val < proposed_val):
proposed_val = val
proposed_name = name
if baseline_val is not None and proposed_val is not None:
_baseline_wins = (baseline_val > proposed_val) if _maximize else (baseline_val < proposed_val)
if _baseline_wins:
advisories.append(
f"NEGATIVE RESULT: Baseline '{baseline_name}' ({baseline_val:.4f}) "
f"outperforms proposed method '{proposed_name}' ({proposed_val:.4f}). "
"This is a NEGATIVE result. Do NOT claim the proposed method is superior. "
"Frame as 'An Empirical Study of...' or 'When X Falls Short'. "
"Discuss why the baseline won and what this implies for future work."
)
return advisories
def _execute_paper_draft(
stage_dir: Path,
run_dir: Path,
config: RCConfig,
adapters: AdapterBundle,
*,
llm: LLMClient | None = None,
prompts: PromptManager | None = None,
) -> StageResult:
outline = _read_prior_artifact(run_dir, "outline.md") or ""
preamble = _build_context_preamble(
config,
run_dir,
include_goal=True,
include_hypotheses=True,
include_analysis=True,
include_experiment_data=True, # WS-5.1: inject real experiment data
)
# BUG-222: Read PROMOTED BEST experiment_summary for the paper prompt.
# Previous code (R21-1) picked the "richest" experiment_summary across
# all stage-14* dirs. After REFINE regression, a later iteration with
# more conditions but worse quality could win, feeding the LLM regressed
# data. Now: prefer experiment_summary_best.json (written by
# _promote_best_stage14()), fall back to richest stage-14* for
# non-REFINE runs.
exp_summary_text = None
_best_path = run_dir / "experiment_summary_best.json"
if _best_path.is_file():
try:
_text = _best_path.read_text(encoding="utf-8")
_parsed = _safe_json_loads(_text, {})
if isinstance(_parsed, dict) and (
_parsed.get("condition_summaries") or _parsed.get("metrics_summary")
):
exp_summary_text = _text
logger.info("BUG-222: Using promoted experiment_summary_best.json")
except OSError:
pass
if exp_summary_text is None:
# Fallback: pick richest stage-14* (pre-BUG-222 behavior)
_best_metric_count = 0
for _s14_dir in sorted(run_dir.glob("stage-14*")):
_candidate = _s14_dir / "experiment_summary.json"
if _candidate.is_file():
_text = _candidate.read_text(encoding="utf-8")
_parsed = _safe_json_loads(_text, {})
if isinstance(_parsed, dict):
_mcount = _parsed.get("total_metric_keys", 0) or len(
_parsed.get("metrics_summary", {})
)
_paired_count = len(_parsed.get("paired_comparisons", []))
_cond_count = len(_parsed.get("condition_summaries", {}))
_score = _mcount + _paired_count * 10 + _cond_count * 5
if _score > _best_metric_count:
_best_metric_count = _score
exp_summary_text = _text
logger.info(
"R21-1 fallback: Selected %s (score=%d)",
_s14_dir.name, _score,
)
if exp_summary_text is None:
exp_summary_text = _read_prior_artifact(run_dir, "experiment_summary.json")
exp_metrics_instruction = ""
has_real_metrics = False
_verified_registry = None # Phase 1: anti-fabrication verified data registry
# BUG-108: Load refinement_log so VerifiedRegistry has per-iteration metrics
_refinement_log_for_vr: dict | None = None
_rl_candidates = sorted(run_dir.glob("stage-13*/refinement_log.json"), reverse=True)
_rl_path = _rl_candidates[0] if _rl_candidates else None
if _rl_path and _rl_path.is_file():
try:
_refinement_log_for_vr = json.loads(_rl_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
pass
if exp_summary_text:
exp_summary = _safe_json_loads(exp_summary_text, {})
# Phase 1: Build VerifiedRegistry from experiment data
if isinstance(exp_summary, dict):
try:
from researchclaw.pipeline.verified_registry import VerifiedRegistry
# BUG-222: Use best_only=True to ensure paper tables reflect
# only the promoted best iteration, not regressed data
_verified_registry = VerifiedRegistry.from_run_dir(
run_dir,
metric_direction=config.experiment.metric_direction,
best_only=True,
)
logger.info(
"Stage 17: VerifiedRegistry — %d verified values, %d conditions",
len(_verified_registry.values),
len(_verified_registry.condition_names),
)
except Exception as _vr_exc:
logger.warning("Stage 17: Failed to build VerifiedRegistry: %s", _vr_exc)
if isinstance(exp_summary, dict) and exp_summary.get("metrics_summary"):
has_real_metrics = True
exp_metrics_instruction = (
"\n\nIMPORTANT: Use the ACTUAL experiment results provided in the context. "
"All numbers in the Results and Experiments sections MUST reference real data. "
"Do NOT write 'no quantitative results yet' or use placeholder numbers. "
"Cite specific metrics with their actual values.\n"
)
# Collect raw experiment stdout metrics as hard constraint for the paper
raw_metrics_block, _has_parsed_metrics = _collect_raw_experiment_metrics(run_dir)
if raw_metrics_block:
# BUG-23: Raw stdout alone is not sufficient — require either
# metrics_summary data, parsed metrics from run JSONs,
# OR at least 3 condition= patterns in raw block
_has_condition_pattern = len(re.findall(
r"condition[=:]", raw_metrics_block, re.IGNORECASE
)) >= 3
if has_real_metrics or _has_parsed_metrics or _has_condition_pattern:
has_real_metrics = True
exp_metrics_instruction += raw_metrics_block
# R18-1 + R19-6: Inject paired statistical comparisons AND condition summaries
if exp_summary_text:
exp_summary_parsed = _safe_json_loads(exp_summary_text, {})
if isinstance(exp_summary_parsed, dict):
# R19-6: Inject experiment scale header so LLM knows the data richness
_total_conds = exp_summary_parsed.get("total_conditions")
_total_mkeys = exp_summary_parsed.get("total_metric_keys")
if _total_conds or _total_mkeys:
scale_block = "\n\n## EXPERIMENT SCALE\n"
if _total_conds:
scale_block += f"- Total conditions tested: {_total_conds}\n"
if _total_mkeys:
scale_block += f"- Total metric keys collected: {_total_mkeys}\n"
scale_block += (
"- This is a MULTI-SEED experiment. Report mean +/- std across seeds.\n"
"- Do NOT describe results as 'single run' or 'preliminary'.\n"
)
exp_metrics_instruction += scale_block
# Improvement B: Inject seed insufficiency warnings
_seed_warns = exp_summary_parsed.get("seed_insufficiency_warnings", [])
if _seed_warns:
_sw_block = (
"\n\n## SEED INSUFFICIENCY WARNINGS\n"
"Some conditions were run with fewer than 3 seeds. "
"Results for these conditions MUST be footnoted as preliminary.\n"
"All tables MUST show mean ± std format. Single-run values "
"MUST be footnoted with '†single seed — interpret with caution'.\n"
)
for _sw in _seed_warns:
_sw_block += f"- {_sw}\n"
exp_metrics_instruction += _sw_block
# R19-6 + R33: Inject condition summaries with CIs
cond_summaries = exp_summary_parsed.get("condition_summaries", {})
if isinstance(cond_summaries, dict) and cond_summaries:
cond_block = "\n\n## PER-CONDITION SUMMARY (use in Results tables)\n"
for cname, cdata in sorted(cond_summaries.items()):
cond_block += f"\n### {cname}\n"
if not isinstance(cdata, dict):
continue
sr = cdata.get("success_rate")
if sr is not None:
try:
cond_block += f"- Success rate: {float(sr):.1%}\n"
except (ValueError, TypeError):
cond_block += f"- Success rate: {sr}\n"
ns = cdata.get("n_seeds") or cdata.get("n_seed_metrics")
if ns:
cond_block += f"- Seeds: {ns}\n"
ci_lo = cdata.get("ci95_low")
ci_hi = cdata.get("ci95_high")
if ci_lo is not None and ci_hi is not None:
try:
cond_block += f"- Bootstrap 95% CI: [{float(ci_lo):.4f}, {float(ci_hi):.4f}]\n"
except (ValueError, TypeError):
cond_block += f"- Bootstrap 95% CI: [{ci_lo}, {ci_hi}]\n"
cm = cdata.get("metrics") or {}
if isinstance(cm, dict) and cm:
for mk, mv in sorted(cm.items()):
if isinstance(mv, (int, float)):
cond_block += f"- {mk}: {mv:.4f}\n"
else:
cond_block += f"- {mk}: {mv}\n"
exp_metrics_instruction += cond_block
# R18-1: Inject paired statistical comparisons
paired = exp_summary_parsed.get("paired_comparisons", [])
if paired:
paired_block = "\n\n## PAIRED STATISTICAL COMPARISONS (use these in Results)\n"
paired_block += f"Total: {len(paired)} paired tests computed.\n"
for pc in paired:
if not isinstance(pc, dict):
continue
method = pc.get("method", "?")
baseline = pc.get("baseline", "?")
regime = pc.get("regime", "all")
md = pc.get("mean_diff", "?")
sd = pc.get("std_diff", "?")
ts = pc.get("t_stat", "?")
pv = pc.get("p_value", "?")
ci_lo = pc.get("ci95_low")
ci_hi = pc.get("ci95_high")
ci_str = ""
if ci_lo is not None and ci_hi is not None:
try:
ci_str = f", 95% CI [{float(ci_lo):.3f}, {float(ci_hi):.3f}]"
except (ValueError, TypeError):
ci_str = f", 95% CI [{ci_lo}, {ci_hi}]"
paired_block += (
f"- {method} vs {baseline} (regime={regime}): "
f"mean_diff={md}, std_diff={sd}, "
f"t={ts}, p={pv}{ci_str}\n"
)
exp_metrics_instruction += paired_block
# R24: Method naming map — translate generic condition labels
_cond_names = list(cond_summaries.keys()) if isinstance(cond_summaries, dict) and cond_summaries else []
if _cond_names:
naming_block = (
"\n\n## METHOD NAMING (CRITICAL — do NOT use generic labels in the paper)\n"
"The condition labels below come from the experiment code. In the paper, "
"you MUST use DESCRIPTIVE algorithm names, not generic labels.\n"
"- If a condition name is already descriptive (e.g., 'random_search', "
"'bayesian_optimization', 'ppo_policy'), use it directly as a proper name.\n"
"- If a condition name is generic (e.g., 'baseline_1', 'method_variant_1'), "
"you MUST infer the algorithm from the experiment code/context and use the "
"real algorithm name (e.g., 'Random Search', 'Bayesian Optimization', "
"'PPO', 'Curiosity-Driven RL').\n"
"- NEVER write `baseline_1` or `method_variant_1` in the paper text.\n"
f"- Conditions to name: {_cond_names}\n"
)
exp_metrics_instruction += naming_block
# IMP-8: Inject broken ablation warnings
abl_warnings = exp_summary_parsed.get("ablation_warnings", [])
if abl_warnings:
broken_block = (
"\n\n## BROKEN ABLATIONS (DO NOT discuss as valid results)\n"
"The following ablation conditions produced IDENTICAL outputs, "
"indicating implementation bugs. Do NOT present their differences "
"as findings. Mention them ONLY in a 'Limitations' sub-section "
"as known implementation issues:\n"
)
for _aw in abl_warnings:
broken_block += f"- {_aw}\n"
broken_block += (
"\nIf you reference these conditions, state explicitly: "
"'Due to an implementation defect, conditions X and Y produced "
"identical outputs; their comparison is therefore uninformative.'\n"
)
exp_metrics_instruction += broken_block
# R25: Statistical table format requirement
if paired:
stat_table_block = (
"\n\n## STATISTICAL TABLE REQUIREMENT (MANDATORY in Results section)\n"
"The Results section MUST include a statistical comparison table with columns:\n"
"| Comparison | Mean Diff | Std Diff | t-statistic | p-value | Significance |\n"
"Use the PAIRED STATISTICAL COMPARISONS data above to fill this table.\n"
"Mark significance: *** (p<0.001), ** (p<0.01), * (p<0.05), n.s.\n"
"This is non-negotiable — a top-venue paper MUST have statistical tests.\n"
)
exp_metrics_instruction += stat_table_block
# R26: Metric definition requirement
exp_metrics_instruction += (
"\n\n## METRIC DEFINITIONS (MANDATORY in Experiments section)\n"
"The Experiments section MUST define each metric:\n"
"- **Primary metric**: what it measures, how it is computed, range, direction "
"(higher/lower is better), and units if applicable.\n"
"- **Secondary metric**: same details.\n"
"- For time-to-event metrics: explain the horizon, what constitutes success, "
"and how failures are handled (e.g., set to max horizon).\n"
"- These definitions MUST appear BEFORE any results tables.\n"
)
# R27: Multi-seed framing enforcement
_any_seeds = any(
(cond_summaries.get(c) or {}).get("n_seed_metrics", 0) > 1
for c in _cond_names
) if _cond_names else False
if _any_seeds:
exp_metrics_instruction += (
"\n\n## MULTI-SEED EXPERIMENT FRAMING (CRITICAL)\n"
"This experiment uses MULTIPLE independent random seeds per condition.\n"
"- Report mean +/- std (or SE) for all metrics.\n"
"- NEVER describe this as 'a single run' or '1 benchmark-artifact run'.\n"
"- Frame as: 'We evaluate each method across N seeds per regime.'\n"
"- The seed-level data IS the evidence base — it is NOT a single observation.\n"
"- Include per-regime breakdowns (easy vs hard) as separate rows in tables.\n"
)
# BUG-003: Inject actual evaluated datasets as a hard constraint
if exp_summary_text:
_ds_parsed = _safe_json_loads(exp_summary_text, {})
if isinstance(_ds_parsed, dict):
_datasets: set[str] = set()
# Extract from condition names (often contain dataset info)
for _cname in (_ds_parsed.get("condition_summaries") or {}).keys():
_datasets.add(str(_cname))
# Extract from explicit "datasets" field if present
for _ds in (_ds_parsed.get("datasets") or []):
if isinstance(_ds, str):
_datasets.add(_ds)
# Extract from "benchmark" or "dataset" fields
for _key in ("benchmark", "dataset", "dataset_name"):
_dv = _ds_parsed.get(_key)
if isinstance(_dv, str) and _dv:
_datasets.add(_dv)
if _datasets:
exp_metrics_instruction += (
"\n\n## ACTUAL EVALUATED DATASETS (HARD CONSTRAINT)\n"
"The following datasets/conditions were ACTUALLY tested in experiments:\n"
+ "".join(f"- {d}\n" for d in sorted(_datasets))
+ "\nCRITICAL: Do NOT claim evaluation on any dataset not listed above.\n"
"Do NOT fabricate results for datasets you did not run experiments on.\n"
"If you reference other datasets, clearly state they are 'not evaluated "
"in this work' or are 'left for future work'.\n"
)
# P7: Ablation effectiveness check
if exp_summary_text:
_exp_parsed_p7 = _safe_json_loads(exp_summary_text, {})
if isinstance(_exp_parsed_p7, dict):
_abl_warnings = _check_ablation_effectiveness(_exp_parsed_p7)
if _abl_warnings:
_abl_block = (
"\n\n## ABLATION EFFECTIVENESS WARNINGS\n"
"The following ablations showed minimal effect (within 5% of baseline). "
"Discuss this honestly — it may indicate the ablated component is not "
"important, or the ablation was not properly implemented:\n"
)
for _aw in _abl_warnings:
_abl_block += f"- {_aw}\n"
exp_metrics_instruction += _abl_block
logger.warning("P7: Ablation effectiveness warnings: %s", _abl_warnings)
# P10: Contradiction detection
if exp_summary_text:
_exp_parsed_p10 = _safe_json_loads(exp_summary_text, {})
if isinstance(_exp_parsed_p10, dict):
_contradictions = _detect_result_contradictions(
_exp_parsed_p10, metric_direction=config.experiment.metric_direction
)
if _contradictions:
_contra_block = (
"\n\n## RESULT INTERPRETATION ADVISORIES (CRITICAL — read before writing)\n"
)
for _ca in _contradictions:
_contra_block += f"- {_ca}\n"
exp_metrics_instruction += _contra_block
logger.warning("P10: Contradiction advisories: %s", _contradictions)
# R10: HARD BLOCK — refuse to write paper when all data is simulated
all_simulated = True
for stage_subdir in sorted(run_dir.glob("stage-*/runs")):
for run_file in sorted(stage_subdir.glob("*.json")):
if run_file.name == "results.json":
continue
try:
_payload = json.loads(run_file.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
continue
if isinstance(_payload, dict) and _payload.get("status") != "simulated":
all_simulated = False
break
if not all_simulated:
break
if all_simulated:
logger.error(
"BLOCKED: All experiment data is simulated (mode='simulated'). "
"Cannot write a paper based on formulaic fake data. "
"Switch to experiment.mode='sandbox' and re-run."
)
(stage_dir / "paper_draft.md").write_text(
"# Paper Draft Blocked\n\n"
"**Reason**: All experiment results are from simulated mode "
"(formulaic data: `0.3 + idx * 0.03`). "
"These are not real experimental results.\n\n"
"**Action Required**: Set `experiment.mode: 'sandbox'` in "
"config.arc.yaml and re-run the pipeline.",
encoding="utf-8",
)
return StageResult(
stage=Stage.PAPER_DRAFT,
status=StageStatus.FAILED,
artifacts=("paper_draft.md",),
evidence_refs=(),
)
# R4-2: HARD BLOCK — refuse to write paper with no real data (ML/empirical domains)
# For non-empirical domains (math proofs, theoretical economics), allow proceeding
_domain_id, _domain_name, _domain_venues = _detect_domain(
config.research.topic, config.research.domains
)
_empirical_domains = {"ml", "engineering", "biology", "chemistry"}
if not has_real_metrics:
if _domain_id in _empirical_domains:
logger.error(
"BLOCKED: Cannot write paper — experiment produced NO metrics. "
"The pipeline will not fabricate results."
)
(stage_dir / "paper_draft.md").write_text(
"# Paper Draft Blocked\n\n"
"**Reason**: Experiment stage produced no metrics (status: failed/timeout). "
"Cannot write a paper without real experimental data.\n\n"
"**Action Required**: Fix experiment execution or increase time_budget_sec.",
encoding="utf-8",
)
return StageResult(
stage=Stage.PAPER_DRAFT,
status=StageStatus.FAILED,
artifacts=("paper_draft.md",),
evidence_refs=(),
)
else:
logger.warning(
"No experiment metrics found, but domain '%s' may be non-empirical "
"(theoretical/mathematical). Proceeding with paper draft.",
_domain_name,
)
# R11-5: Experiment quality minimum threshold before paper writing
# Parse analysis.md for quality rating and condition completeness
analysis_text = _read_best_analysis(run_dir)
_quality_warnings: list[str] = []
# Check 1: Was the analysis quality rating very low?
import re as _re_q
_rating_match = _re_q.search(
r"(?:quality\s+rating|result\s+quality)[:\s]*\**(\d+)\s*/\s*10",
analysis_text,
_re_q.IGNORECASE,
)
if _rating_match:
_analysis_rating = int(_rating_match.group(1))
if _analysis_rating <= 3:
_quality_warnings.append(
f"Analysis rated experiment quality {_analysis_rating}/10"
)
# BUG-23: If quality rating is ≤ 2, force has_real_metrics = False
# to prevent fabricated results even if stdout had stray numbers.
# R5-BUG-05: Skip override when _has_parsed_metrics is True — the
# analysis.md may be stale (from pre-refinement Stage 14) while
# Stage 13 refinement produced real parsed metrics.
if _analysis_rating <= 2 and has_real_metrics and not _has_parsed_metrics:
logger.warning(
"BUG-23 guard: Analysis quality %d/10 \u2264 2 — "
"overriding has_real_metrics to False (experiment likely failed)",
_analysis_rating,
)
has_real_metrics = False
# Check 2: Are baselines missing?
_analysis_lower = analysis_text.lower()
if "no" in _analysis_lower and "baseline" in _analysis_lower:
if any(phrase in _analysis_lower for phrase in [
"no baseline", "no bo", "no random", "baselines are missing",
"missing baselines", "baseline coverage is missing",
]):
_quality_warnings.append("Baselines appear to be missing from results")
# Check 3: Is the metric undefined?
if any(phrase in _analysis_lower for phrase in [
"metric is undefined", "primary_metric is undefined",
"undefined metric", "metric undefined",
]):
_quality_warnings.append("Primary metric is undefined (direction/units/formula unknown)")
# Check 4: Very few conditions completed
_condition_count = len(_re_q.findall(
r"condition[=:\s]+\w+.*?(?:mean|primary_metric)",
raw_metrics_block or "",
_re_q.IGNORECASE,
))
if _quality_warnings:
_warning_block = "\n".join(f" - {w}" for w in _quality_warnings)
logger.warning(
"Stage 17: Experiment quality concerns detected before paper writing:\n%s",
_warning_block,
)
# Inject quality warnings into the paper writing prompt so the LLM
# writes an appropriately hedged paper
exp_metrics_instruction += (
"\n\n## EXPERIMENT QUALITY WARNINGS (address these honestly in the paper)\n"
+ "\n".join(f"- {w}" for w in _quality_warnings)
+ "\n\nBecause of these issues, the paper MUST:\n"
"- Use hedged language ('preliminary', 'pilot', 'initial exploration')\n"
"- NOT claim definitive comparisons between methods\n"
"- Dedicate a substantial Limitations section to these gaps\n"
"- Frame the contribution as methodology/framework, not empirical findings\n"
)
# Save warnings for tracking
(stage_dir / "quality_warnings.json").write_text(
json.dumps(_quality_warnings, indent=2), encoding="utf-8"
)
# Phase 1: Inject pre-built results tables from VerifiedRegistry
if _verified_registry is not None:
try:
from researchclaw.templates.results_table_builder import (
build_results_tables,
build_condition_whitelist,
)
_prebuilt_tables = build_results_tables(
_verified_registry,
metric_direction=_verified_registry.metric_direction,
)
_condition_whitelist = build_condition_whitelist(_verified_registry)
if _prebuilt_tables:
_tables_block = "\n\n".join(t.latex_code for t in _prebuilt_tables)
exp_metrics_instruction += (
"\n\n## PRE-BUILT RESULTS TABLES (MANDATORY — copy verbatim)\n"
"The tables below were AUTO-GENERATED from verified experiment data.\n"
"You MUST include these tables in the Results section EXACTLY as shown.\n"
"Do NOT modify any numbers. Do NOT add rows with fabricated data.\n"
"You MAY adjust formatting (bold, alignment) but NOT numerical values.\n\n"
+ _tables_block
)
logger.info("Stage 17: Injected pre-built results tables into prompt")
if _condition_whitelist:
exp_metrics_instruction += (
"\n\n## VERIFIED CONDITIONS (ONLY mention these in the paper)\n"
+ _condition_whitelist
+ "\nDo NOT discuss conditions not in this list. Do NOT invent new conditions.\n"
)
except Exception as _tb_exc:
logger.warning("Stage 17: Failed to build pre-built tables: %s", _tb_exc)
# R4-2: Anti-fabrication data integrity instruction
exp_metrics_instruction += (
"\n\n## CRITICAL: Data Integrity Rules\n"
"- You may ONLY report numbers that appear in the experiment data above\n"
"- If the experiment data is incomplete (fewer conditions than planned), report\n"
" ONLY the conditions that were actually run\n"
"- Do NOT extrapolate, interpolate, or 'fill in' missing cells in tables\n"
"- Do NOT invent confidence intervals, p-values, or statistical tests unless\n"
" the actual data supports them\n"
"- If only N conditions completed, simply report results for those N conditions\n"
" without repeating apologies or disclaimers about missing conditions\n"
"- Any table cell without real data must show '\u2014' (not a plausible number)\n"
"- FORBIDDEN: generating numbers that 'look right' based on your training data\n"
)
# IMP-6 + FA: Inject chart references into paper draft prompt
# Prefer FigureAgent's figure_plan.json (rich descriptions) over raw file scan
# BUG-FIX: figure_plan.json may be a list (from FigureAgent planner) or a dict
# (from executor overwrite). The orchestrator writes a list at planning time;
# the executor overwrites with a dict only when figure_count > 0. If the
# FigureAgent renders 0 charts the list persists, and calling .get() on it
# raises AttributeError.
_fa_descriptions = ""
# BUG-178: Iterate in reverse order so we read the LATEST stage-14
# iteration's figure plan, matching Stage 22 which copies charts
# from the newest iteration.
for _s14_dir in sorted(run_dir.glob("stage-14*"), reverse=True):
# Prefer the final plan (dict with figure_descriptions) if it exists
for _fp_name in ("figure_plan_final.json", "figure_plan.json"):
_fp_path = _s14_dir / _fp_name
if not _fp_path.exists():
continue
try:
_fp_data = json.loads(_fp_path.read_text(encoding="utf-8"))
if isinstance(_fp_data, dict):
_fa_descriptions = _fp_data.get("figure_descriptions", "")
elif isinstance(_fp_data, list) and _fp_data:
# List format from FigureAgent planner — synthesize descriptions
_desc_parts = ["## PLANNED FIGURES (from figure plan)\n"]
for _fig in _fp_data:
if isinstance(_fig, dict):
_fid = _fig.get("figure_id", "unnamed")
_ftitle = _fig.get("title", "")
_fcap = _fig.get("caption", "")
_fsec = _fig.get("section", "results")
_desc_parts.append(
f"- **{_fid}** ({_fsec}): {_ftitle}\n {_fcap}"
)
if len(_desc_parts) > 1:
_fa_descriptions = "\n".join(_desc_parts)
except (json.JSONDecodeError, OSError):
pass
if _fa_descriptions:
break
if _fa_descriptions:
break
if _fa_descriptions:
exp_metrics_instruction += "\n\n" + _fa_descriptions
logger.info("Stage 17: Injected FigureAgent figure descriptions into paper draft prompt")
else:
# Fallback: scan for chart files from the LATEST stage-14 iteration
# BUG-178: Must use reverse order to match Stage 22 chart copy behavior
_chart_files: list[str] = []
for _s14_dir in sorted(run_dir.glob("stage-14*"), reverse=True):
_charts_path = _s14_dir / "charts"
if _charts_path.is_dir():
_found = sorted(_charts_path.glob("*.png"))
if _found:
_chart_files = [f.name for f in _found]
break # Use only the latest iteration's charts
if _chart_files:
_chart_block = (
"\n\n## AVAILABLE FIGURES (embed in the paper)\n"
"The following figures were generated from actual experiment data. "
"You MUST reference at least 1-2 of these in the Results section "
"using markdown image syntax: ``\n\n"
)
for _cf_name in _chart_files:
_label = _cf_name.replace("_", " ").replace(".png", "").title()
_chart_block += f"- `charts/{_cf_name}` \u2014 {_label}\n"
_chart_block += (
"\nFor each figure referenced, write a descriptive caption and "
"discuss what the figure shows in 2-3 sentences.\n"
)
exp_metrics_instruction += _chart_block
logger.info(
"Stage 17: Injected %d chart references into paper draft prompt",
len(_chart_files),
)
# WS-5.5: Framework diagram placeholder instruction
exp_metrics_instruction += (
"\n\n## FRAMEWORK DIAGRAM PLACEHOLDER\n"
"In the Method/Approach section, include a placeholder for the methodology "
"framework overview figure. Insert this exactly:\n\n"
"```\n"
"\n"
"**Figure N.** Overview of the proposed methodology. "
"[A detailed framework diagram will be generated separately and inserted here.]\n"
"```\n\n"
"This figure should be referenced in the text as 'Figure N' and discussed briefly "
"(1-2 sentences describing the overall pipeline/architecture flow). "
"The actual image will be generated post-hoc using a text-to-image model.\n"
)
# P5: Extract hyperparameters from results.json for paper Method section
_hp_table = ""
for _s14_dir in sorted(run_dir.glob("stage-14*")):
for _run_file in sorted(_s14_dir.glob("runs/*.json")):
try:
_run_data = json.loads(_run_file.read_text(encoding="utf-8"))
if isinstance(_run_data, dict) and _run_data.get("hyperparameters"):
_hp = _run_data["hyperparameters"]
if isinstance(_hp, dict) and _hp:
_hp_table = "\n\n## HYPERPARAMETERS (include as a table in the Method section)\n"
_hp_table += "| Hyperparameter | Value |\n|---|---|\n"
for _hk, _hv in sorted(_hp.items()):
_hp_table += f"| {_hk} | {_hv} |\n"
_hp_table += (
"\nThis table MUST appear in the Method/Experiments section. "
"Include ALL hyperparameters used, with justification for key choices.\n"
)
break
except (json.JSONDecodeError, OSError):
continue
if _hp_table:
break
# Also check staging dirs for results.json
if not _hp_table:
for _staging_dir in sorted(run_dir.glob("stage-*/runs/_docker_*")):
_rjson = _staging_dir / "results.json"
if _rjson.is_file():
try:
_rdata = json.loads(_rjson.read_text(encoding="utf-8"))
if isinstance(_rdata, dict) and _rdata.get("hyperparameters"):
_hp = _rdata["hyperparameters"]
if isinstance(_hp, dict) and _hp:
_hp_table = "\n\n## HYPERPARAMETERS (include as a table in the Method section)\n"
_hp_table += "| Hyperparameter | Value |\n|---|---|\n"
for _hk, _hv in sorted(_hp.items()):
_hp_table += f"| {_hk} | {_hv} |\n"
_hp_table += (
"\nThis table MUST appear in the Method/Experiments section. "
"Include ALL hyperparameters used, with justification for key choices.\n"
)
break
except (json.JSONDecodeError, OSError):
continue
if _hp_table:
exp_metrics_instruction += _hp_table
# F2.6: Build citation list from references.bib / candidates with cite_keys
citation_instruction = ""
bib_text = _read_prior_artifact(run_dir, "references.bib")
# P3: Pre-verify citations before paper draft — remove hallucinated refs
if bib_text and bib_text.strip():
from researchclaw.literature.verify import (
filter_verified_bibtex,
verify_citations as _verify_cit,
)
try:
_pre_report = _verify_cit(bib_text, inter_verify_delay=0.5)
_kept = _pre_report.verified + _pre_report.suspicious
_removed = _pre_report.hallucinated
if _removed > 0:
bib_text = filter_verified_bibtex(
bib_text, _pre_report, include_suspicious=True
)
(stage_dir / "references_preverified.bib").write_text(
bib_text, encoding="utf-8"
)
logger.info(
"P3: Pre-verification kept %d/%d citations (removed %d hallucinated)",
_kept, _pre_report.total, _removed,
)
except Exception as exc:
logger.warning("P3: Pre-verification failed, using original bib: %s", exc)
candidates_text = _read_prior_artifact(run_dir, "candidates.jsonl")
if candidates_text:
cite_lines: list[str] = []
for row_text in candidates_text.strip().splitlines():
row = _safe_json_loads(row_text, {})
if isinstance(row, dict) and row.get("cite_key"):
authors_info = ""
if isinstance(row.get("authors"), list) and row["authors"]:
first_author = row["authors"][0]
if isinstance(first_author, dict):
# BUG-38: name may be non-str (tuple/list) — force str
_name = first_author.get("name", "")
authors_info = _name if isinstance(_name, str) else str(_name)
elif isinstance(first_author, str):
authors_info = first_author
if len(row["authors"]) > 1:
authors_info += " et al."
title = row.get("title", "")
cite_lines.append(
f"- [{row['cite_key']}] \u2192 TITLE: \"{title}\" "
f"| {authors_info} "
f"({row.get('venue', '')}, {row.get('year', '')}, "
f"cited {row.get('citation_count', 0)} times) "
f"| ONLY cite this key when discussing: {title}"
)
if cite_lines:
citation_instruction = (
"\n\nAVAILABLE REFERENCES (use [cite_key] to cite in the text):\n"
+ "\n".join(cite_lines)
+ "\n\nCRITICAL CITATION RULES:\n"
"- In the body text, cite using [cite_key] format, e.g. [smith2024transformer].\n"
"- Do NOT write a References section \u2014 it will be auto-generated from the bibliography file.\n"
"- Do NOT invent any references or arXiv IDs not in the above list.\n"
"- You may cite a subset, but NEVER fabricate citations or change arXiv IDs.\n"
"- SEMANTIC MATCHING: Before citing a reference, verify that its TITLE matches\n"
" the concept you are discussing. Do NOT use an unrelated cite_key just\n"
" because it sounds similar.\n"
"- If no reference in the list matches the concept you want to cite,\n"
" write 'prior work has shown...' WITHOUT a citation, rather than using\n"
" a mismatched reference.\n"
"- Each [cite_key] MUST correspond to the paper whose title is shown\n"
" next to that key in the list above. Cross-check before citing.\n"
"\nCITATION QUANTITY & QUALITY CONSTRAINTS:\n"
"- Cite 25-40 unique references in the paper body. The Related Work\n"
" section alone should cite at least 15 references.\n"
"- Every citation MUST be directly relevant to the paper's topic.\n"
"- DO NOT cite papers from unrelated domains (wireless communication, "
"manufacturing, UAV, etc.).\n"
"- Prefer well-known, highly-cited papers over obscure ones.\n"
"- If unsure whether a paper exists or is relevant, DO NOT cite it.\n"
)
if llm is not None:
_pm = prompts or PromptManager()
topic_constraint = _pm.block("topic_constraint", topic=config.research.topic)
# --- Section-by-section writing (3 calls) for conference-grade depth ---
draft = _write_paper_sections(
llm=llm,
pm=_pm,
run_dir=run_dir,
preamble=preamble,
topic_constraint=topic_constraint,
exp_metrics_instruction=exp_metrics_instruction,
citation_instruction=citation_instruction,
outline=outline,
model_name=config.llm.primary_model,
)
# R7: Strip LLM-generated References section — it often fabricates arXiv IDs.
import re as _re_r7
ref_pattern = _re_r7.compile(
r'^(#{1,2}\s*References.*)', _re_r7.MULTILINE | _re_r7.DOTALL
)
ref_match = ref_pattern.search(draft)
if ref_match:
draft = draft[:ref_match.start()].rstrip()
logger.info("Stage 17: Stripped LLM-generated References section (R7 fix)")
else:
# Build template with real data if available
results_section = "Template results summary."
if exp_summary_text:
exp_summary = _safe_json_loads(exp_summary_text, {})
if isinstance(exp_summary, dict) and exp_summary.get("metrics_summary"):
lines = ["Experiment results:"]
for mk, mv in exp_summary["metrics_summary"].items():
if isinstance(mv, dict):
lines.append(
f"- {mk}: mean={mv.get('mean')}, min={mv.get('min')}, "
f"max={mv.get('max')}, n={mv.get('count')}"
)
results_section = "\n".join(lines)
draft = f"""# Draft Title
## Abstract
Template draft abstract.
## Introduction
Template introduction for {config.research.topic}.
## Related Work
Template related work.
## Method
Template method description.
## Experiments
Template experimental setup.
## Results
{results_section}
## Limitations
Template limitations.
## Conclusion
Template conclusion.
## References
Template references.
Generated: {_utcnow_iso()}
"""
(stage_dir / "paper_draft.md").write_text(draft, encoding="utf-8")
# Validate draft quality (section balance + bullet density)
_validate_draft_quality(draft, stage_dir=stage_dir)
return StageResult(
stage=Stage.PAPER_DRAFT,
status=StageStatus.DONE,
artifacts=("paper_draft.md",),
evidence_refs=("stage-17/paper_draft.md",),
)
================================================
FILE: researchclaw/pipeline/stage_impls/_review_publish.py
================================================
"""Stages 18-23: Peer review, paper revision, quality gate, knowledge archive, export/publish, and citation verify."""
from __future__ import annotations
import json
import logging
import math
import re
from collections import Counter
from pathlib import Path
from typing import Any
import yaml # noqa: F401 — available for downstream use
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain # noqa: F401
from researchclaw.pipeline._helpers import (
StageResult,
_build_context_preamble,
_chat_with_prompt,
_collect_experiment_results, # noqa: F401
_default_quality_report,
_extract_paper_title,
_find_prior_file,
_generate_framework_diagram_prompt,
_generate_neurips_checklist,
_get_evolution_overlay,
_read_best_analysis,
_read_prior_artifact,
_safe_json_loads,
_topic_constraint_block, # noqa: F401
_utcnow_iso,
reconcile_figure_refs,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Helpers imported from executor.py (not yet moved to _helpers.py).
# Lazy-imported inside functions to avoid circular import when executor.py
# imports this module.
# ---------------------------------------------------------------------------
def _get_collect_raw_experiment_metrics():
    """Lazily resolve Stage 17's raw-metrics collector.

    Imported inside the function body (not at module level) to avoid a
    circular import: executor.py imports this module, which in turn needs
    a helper living in ``_paper_writing.py``.
    """
    from researchclaw.pipeline.stage_impls._paper_writing import (
        _collect_raw_experiment_metrics as _impl,
    )
    return _impl
def _get_review_compiled_pdf():
    """Lazily resolve Stage 17's compiled-PDF review helper.

    The function-scope import breaks the circular dependency between this
    module and ``_paper_writing.py`` (see module header comment).
    """
    from researchclaw.pipeline.stage_impls._paper_writing import (
        _review_compiled_pdf as _impl,
    )
    return _impl
# ---------------------------------------------------------------------------
# _collect_experiment_evidence
# ---------------------------------------------------------------------------
def _collect_experiment_evidence(run_dir: Path) -> str:
    """Collect actual experiment parameters and results for peer review.

    Gathers four kinds of ground-truth evidence from the run directory so the
    reviewer prompt can cross-check the paper's methodology claims:

    1. The experiment entry point (``main.py``) source, truncated to 3000 chars.
    2. Up to five sandbox run-result JSONs (metrics, runtime, stderr excerpt).
    3. A refinement-log summary (iteration count, convergence, best metric).
    4. The actual number of executed runs across all stage ``runs/`` dirs.

    Parameters
    ----------
    run_dir:
        Root directory of the pipeline run.

    Returns
    -------
    str
        A markdown "Actual Experiment Evidence" section, or ``""`` when no
        evidence was found.
    """
    evidence_parts: list[str] = []
    # 1. Read experiment code to find actual trial count, methods used.
    # NOTE(review): _read_prior_artifact presumably returns a path string for
    # directory-style keys like "experiment/" — the is_dir() check guards this.
    exp_dir = _read_prior_artifact(run_dir, "experiment/")
    if exp_dir and Path(exp_dir).is_dir():
        main_py = Path(exp_dir) / "main.py"
        if main_py.exists():
            code = main_py.read_text(encoding="utf-8")
            evidence_parts.append(f"### Actual Experiment Code (main.py)\n```python\n{code[:3000]}\n```")
    # 2. Read sandbox run results (actual metrics, runtime, stderr)
    runs_text = _read_prior_artifact(run_dir, "runs/")
    if runs_text and Path(runs_text).is_dir():
        for run_file in sorted(Path(runs_text).glob("*.json"))[:5]:
            payload = _safe_json_loads(run_file.read_text(encoding="utf-8"), {})
            if isinstance(payload, dict):
                summary = {
                    "metrics": payload.get("metrics"),
                    "elapsed_sec": payload.get("elapsed_sec"),
                    "timed_out": payload.get("timed_out"),
                }
                stderr = payload.get("stderr", "")
                if stderr:
                    summary["stderr_excerpt"] = stderr[:500]
                evidence_parts.append(
                    f"### Run Result: {run_file.name}\n```json\n{json.dumps(summary, indent=2)}\n```"
                )
    # 3. Read refinement log for actual iteration count
    refine_log_text = _read_prior_artifact(run_dir, "refinement_log.json")
    if refine_log_text:
        try:
            rlog = json.loads(refine_log_text)
            # FIX: json.loads may legally return a list/str/number; calling
            # .get() on a non-dict raised AttributeError, which the except
            # clause below does not catch. Guard with an isinstance check.
            if isinstance(rlog, dict):
                summary = {
                    "iterations_executed": len(rlog.get("iterations", [])),
                    "converged": rlog.get("converged"),
                    "stop_reason": rlog.get("stop_reason"),
                    "best_metric": rlog.get("best_metric"),
                }
                evidence_parts.append(
                    f"### Refinement Summary\n```json\n{json.dumps(summary, indent=2)}\n```"
                )
        except (json.JSONDecodeError, TypeError):
            pass
    # 4. Count actual number of experiment runs. results.json is an aggregate
    # file, not an individual run, so it is excluded from the count.
    actual_run_count = 0
    for stage_subdir in sorted(run_dir.glob("stage-*/runs")):
        for rf in stage_subdir.glob("*.json"):
            if rf.name != "results.json":
                actual_run_count += 1
    if actual_run_count > 0:
        evidence_parts.append(
            f"### Actual Trial Count\n"
            f"**The experiment was executed {actual_run_count} time(s).** "
            f"If the paper claims a different number of trials, this is a CRITICAL discrepancy."
        )
    if not evidence_parts:
        return ""
    return (
        "\n\n## Actual Experiment Evidence\n"
        "Use the evidence below to verify the paper's methodology claims.\n\n"
        + "\n\n".join(evidence_parts)
    )
# ---------------------------------------------------------------------------
# Stage 18: Peer Review
# ---------------------------------------------------------------------------
def _execute_peer_review(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 18: generate peer reviews of the paper draft.

    Reads the Stage 17 draft and collected experiment evidence, appends any
    automated draft-quality warnings found in ``draft_quality.json``, and asks
    the LLM for reviews. With no LLM configured, a fixed two-reviewer template
    is written instead. The result is saved as ``reviews.md``.
    """
    draft = _read_prior_artifact(run_dir, "paper_draft.md") or ""
    experiment_evidence = _collect_experiment_evidence(run_dir)
    # Surface Stage 17's automated draft-quality warnings (best effort —
    # any read/parse failure silently yields an empty suffix).
    quality_suffix = ""
    dq_path = _find_prior_file(run_dir, "draft_quality.json")
    if dq_path and dq_path.exists():
        try:
            dq_report = json.loads(dq_path.read_text(encoding="utf-8"))
            dq_warnings = dq_report.get("overall_warnings", [])
            if dq_warnings:
                bullets = "\n".join(f"- {w}" for w in dq_warnings)
                quality_suffix = (
                    "\n\nAUTOMATED QUALITY ISSUES (flag these in your review):\n"
                    + bullets
                    + "\n"
                )
        except Exception:  # noqa: BLE001
            pass
    if llm is None:
        # No LLM available — fall back to the static review template.
        reviews = """# Reviews
## Reviewer A
- Strengths: Clear problem statement.
- Weaknesses: Limited ablation details.
- Actionable revisions: Add uncertainty analysis and stronger baselines.
## Reviewer B
- Strengths: Reproducibility focus.
- Weaknesses: Discussion underdeveloped.
- Actionable revisions: Expand limitations and broader impact.
"""
    else:
        pm = prompts or PromptManager()
        overlay = _get_evolution_overlay(run_dir, "peer_review")
        sp = pm.for_stage(
            "peer_review",
            evolution_overlay=overlay,
            topic=config.research.topic,
            draft=draft,
            experiment_evidence=experiment_evidence,
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user + quality_suffix,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        reviews = resp.content
    (stage_dir / "reviews.md").write_text(reviews, encoding="utf-8")
    return StageResult(
        stage=Stage.PEER_REVIEW,
        status=StageStatus.DONE,
        artifacts=("reviews.md",),
        evidence_refs=("stage-18/reviews.md",),
    )
# ---------------------------------------------------------------------------
# Stage 19: Paper Revision
# ---------------------------------------------------------------------------
def _execute_paper_revision(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 19: revise the paper draft to address the Stage 18 reviews.

    The revision prompt is assembled from the draft, the reviews (prefixed with
    mandatory quality-fix directives from ``draft_quality.json`` and suffixed
    with a data-integrity guard built from real experiment metrics), plus style
    blocks. A length guard retries once when the revision shrinks below 80% of
    the draft; if both attempts are short, the original draft is preserved and
    the short output is archived as internal revision notes. Writes
    ``paper_revised.md``.
    """
    draft = _read_prior_artifact(run_dir, "paper_draft.md") or ""
    reviews = _read_prior_artifact(run_dir, "reviews.md") or ""
    draft_word_count = len(draft.split())
    # R4-2: Collect real metrics for anti-fabrication guard in revision
    # BUG-47: _collect_raw_experiment_metrics returns tuple[str, bool], must unpack
    _raw_metrics_tuple = _get_collect_raw_experiment_metrics()(run_dir)
    raw_metrics_revision = _raw_metrics_tuple[0] if isinstance(_raw_metrics_tuple, tuple) else (_raw_metrics_tuple or "")
    data_integrity_revision = ""
    if raw_metrics_revision:
        data_integrity_revision = (
            raw_metrics_revision
            + "\nDATA INTEGRITY: Do NOT add new numbers that are not in the "
            "experiment data above. If a reviewer asks for additional results "
            "you do not have, state 'Due to computational constraints, "
            "this analysis was not conducted' instead of fabricating data.\n"
        )
    if llm is not None:
        _pm = prompts or PromptManager()
        try:
            _ws_revision = _pm.block("writing_structure")
        # FIX: `except (KeyError, Exception)` was redundant — KeyError is a
        # subclass of Exception, so the plain Exception clause is equivalent.
        except Exception:  # noqa: BLE001
            _ws_revision = ""
        # IMP-20/25/31/24: Load style blocks for revision prompt (each block
        # is optional; missing blocks degrade to an empty string).
        _rev_blocks: dict[str, str] = {}
        for _bname in ("academic_style_guide", "narrative_writing_rules",
                       "anti_hedging_rules", "anti_repetition_rules"):
            try:
                _rev_blocks[_bname] = _pm.block(_bname)
            except Exception:  # noqa: BLE001
                _rev_blocks[_bname] = ""
        # Load draft quality directives from Stage 17 (best effort)
        _quality_prefix = ""
        _quality_json_path = _find_prior_file(run_dir, "draft_quality.json")
        if _quality_json_path and _quality_json_path.exists():
            try:
                _dq = json.loads(_quality_json_path.read_text(encoding="utf-8"))
                _dq_directives = _dq.get("revision_directives", [])
                if _dq_directives:
                    _quality_prefix = (
                        "MANDATORY QUALITY FIXES (address ALL of these):\n"
                        + "\n".join(f"- {d}" for d in _dq_directives)
                        + "\n\n"
                    )
            except Exception:  # noqa: BLE001
                pass
        _overlay = _get_evolution_overlay(run_dir, "paper_revision")
        sp = _pm.for_stage(
            "paper_revision",
            evolution_overlay=_overlay,
            topic_constraint=_pm.block("topic_constraint", topic=config.research.topic),
            writing_structure=_ws_revision,
            draft=draft,
            reviews=_quality_prefix + reviews + data_integrity_revision,
            **_rev_blocks,
        )
        # R10-Fix2: Ensure max_tokens is sufficient for full paper revision
        revision_max_tokens = sp.max_tokens
        if revision_max_tokens and draft_word_count > 0:
            # ~1.5 tokens per word, 20% headroom
            min_tokens_needed = int(draft_word_count * 1.5 * 1.2)
            if revision_max_tokens < min_tokens_needed:
                revision_max_tokens = min_tokens_needed
                logger.info(
                    "Stage 19: Increased max_tokens from %d to %d to fit full paper revision",
                    sp.max_tokens,
                    revision_max_tokens,
                )
        # R10-Fix4: Retry on timeout for paper revision (critical stage)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=revision_max_tokens,
            retries=2,
        )
        revised = resp.content
        revised_word_count = len(revised.split())
        # Length guard: if revision is shorter than 80% of draft, retry once
        if draft_word_count > 500 and revised_word_count < int(draft_word_count * 0.8):
            logger.warning(
                "Paper revision (%d words) is shorter than draft (%d words). "
                "Retrying with stronger length enforcement.",
                revised_word_count,
                draft_word_count,
            )
            retry_user = (
                f"CRITICAL LENGTH REQUIREMENT: The draft is {draft_word_count} words. "
                f"Your revision MUST be at least {draft_word_count} words — ideally longer. "
                f"Do NOT summarize or condense ANY section. Copy each section verbatim "
                f"and ONLY make targeted improvements to address reviewer comments. "
                f"If a section has no reviewer comments, include it UNCHANGED.\n\n"
                + sp.user
            )
            resp2 = _chat_with_prompt(
                llm, sp.system, retry_user,
                json_mode=sp.json_mode, max_tokens=revision_max_tokens,
            )
            revised2 = resp2.content
            revised2_word_count = len(revised2.split())
            if revised2_word_count >= int(draft_word_count * 0.8):
                revised = revised2
            elif revised2_word_count > revised_word_count:
                # Retry improved but still not enough — use the longer version
                revised = revised2
                logger.warning(
                    "Retry improved (%d → %d words) but still shorter than draft (%d).",
                    revised_word_count,
                    revised2_word_count,
                    draft_word_count,
                )
            else:
                # Both attempts produced short output — preserve full original draft
                logger.warning(
                    "Retry also produced short output (%d words). "
                    "Falling back to FULL ORIGINAL DRAFT to prevent content loss.",
                    revised2_word_count,
                )
                # Extract useful revision points as appendix
                revision_words = revised.split()
                revision_summary = (
                    " ".join(revision_words[:500]) + "\n\n*(Revision summary truncated)*"
                    if len(revision_words) > 500
                    else revised
                )
                if revision_summary.strip():
                    # Save revision notes to internal file, not paper body
                    (stage_dir / "revision_notes_internal.md").write_text(
                        revision_summary, encoding="utf-8"
                    )
                revised = draft
    else:
        revised = draft
    (stage_dir / "paper_revised.md").write_text(revised, encoding="utf-8")
    return StageResult(
        stage=Stage.PAPER_REVISION,
        status=StageStatus.DONE,
        artifacts=("paper_revised.md",),
        evidence_refs=("stage-19/paper_revised.md",),
    )
# ---------------------------------------------------------------------------
# Stage 20: Quality Gate
# ---------------------------------------------------------------------------
def _execute_quality_gate(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 20: score the revised paper and gate export on quality.

    Loads the richest experiment summary from the run, asks the LLM for a
    quality report (capping the score at 3.0 when the experiment failed with
    no metrics), writes ``quality_report.json`` and ``fabrication_flags.json``,
    and then either passes, degrades (when ``graceful_degradation`` is set),
    or fails the stage depending on the score vs. the configured threshold.
    """
    revised = _read_prior_artifact(run_dir, "paper_revised.md") or ""
    report: dict[str, Any] | None = None
    # BUG-25 + BUG-180: Load the RICHEST experiment summary for cross-checking.
    # _read_prior_artifact returns the first match in reverse-sorted order,
    # which may be a repair stage with 0 conditions. Instead, scan all
    # stage-14* experiment summaries and pick the one with the most data.
    _exp_summary: dict[str, Any] = {}
    _exp_summary_text = ""
    # Richness = number of condition summaries; -1 so any dict (even empty) wins.
    _best_richness = -1
    for _es_path in sorted(run_dir.glob("stage-14*/experiment_summary.json")):
        try:
            _es_text = _es_path.read_text(encoding="utf-8")
            _es_data = _safe_json_loads(_es_text, {})
            if not isinstance(_es_data, dict):
                continue
            _richness = len(_es_data.get("condition_summaries", {}))
            if _richness > _best_richness:
                _best_richness = _richness
                _exp_summary = _es_data
                _exp_summary_text = _es_text
        except OSError:
            continue
    # Also check experiment_summary_best.json at run root
    _root_best = run_dir / "experiment_summary_best.json"
    if _root_best.is_file():
        try:
            _rb_text = _root_best.read_text(encoding="utf-8")
            _rb_data = _safe_json_loads(_rb_text, {})
            if isinstance(_rb_data, dict):
                _rb_rich = len(_rb_data.get("condition_summaries", {}))
                if _rb_rich > _best_richness:
                    # NOTE: _best_richness is intentionally NOT updated here —
                    # the root-best file only replaces the selected summary.
                    _exp_summary = _rb_data
                    _exp_summary_text = _rb_text
        except OSError:
            pass
    # Fallback to _read_prior_artifact if nothing found above
    if not _exp_summary:
        _exp_summary_text = _read_prior_artifact(run_dir, "experiment_summary.json") or ""
        _exp_summary = _safe_json_loads(_exp_summary_text, {}) if _exp_summary_text else {}
    # Decide whether the experiment should be treated as failed (drives both
    # the score cap below and the fabrication flags written for Stage 22).
    _exp_failed = False
    if isinstance(_exp_summary, dict):
        _best_run = _exp_summary.get("best_run", {})
        if isinstance(_best_run, dict):
            _exp_failed = (
                _best_run.get("status") == "failed"
                and not _best_run.get("metrics")
            )
        # Also check if metrics_summary is empty
        if not _exp_summary.get("metrics_summary"):
            _exp_failed = True
        # BUG-180: If we found real condition data, don't mark as failed
        if _best_richness > 0:
            _exp_failed = False
    if llm is not None:
        _pm = prompts or PromptManager()
        # IMP-33: Evaluate the full paper instead of truncating to 12K chars.
        # Split into chunks if very long, but prefer sending the full text.
        paper_for_eval = revised[:40000] if len(revised) > 40000 else revised
        # BUG-25: Inject experiment status into quality gate prompt
        _exp_context = ""
        if _exp_summary and isinstance(_exp_summary, dict):
            _exp_status_keys = {
                k: _exp_summary.get(k) for k in (
                    "total_conditions", "total_metric_keys",
                    "metrics_summary",
                ) if _exp_summary.get(k) is not None
            }
            # BUG-180: Include condition count from condition_summaries
            _cond_summ = _exp_summary.get("condition_summaries", {})
            if isinstance(_cond_summ, dict) and _cond_summ:
                _exp_status_keys["completed_conditions"] = len(_cond_summ)
                _exp_status_keys["condition_names"] = list(_cond_summ.keys())[:20]
            if _best_run := _exp_summary.get("best_run"):
                _exp_status_keys["best_run_status"] = (
                    _best_run.get("status") if isinstance(_best_run, dict) else str(_best_run)
                )
            _exp_context = (
                "\n\nExperiment summary (for cross-checking reported numbers):\n"
                + json.dumps(_exp_status_keys, indent=2, default=str)[:4000]
                + "\n\nCross-check: If the experiment status is 'failed' with "
                "empty metrics, any numerical results in tables constitute "
                "fabrication. Penalize severely.\n"
            )
        _overlay = _get_evolution_overlay(run_dir, "quality_gate")
        sp = _pm.for_stage(
            "quality_gate",
            evolution_overlay=_overlay,
            quality_threshold=str(config.research.quality_threshold),
            revised=paper_for_eval + _exp_context,
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        parsed = _safe_json_loads(resp.content, {})
        if isinstance(parsed, dict):
            report = parsed
        # BUG-25: If experiment failed with no metrics, cap the quality score
        if report is not None and _exp_failed:
            _orig_score = report.get("score_1_to_10", 5)
            if isinstance(_orig_score, (int, float)) and _orig_score > 3:
                report["score_1_to_10"] = min(_orig_score, 3.0)
                report.setdefault("weaknesses", []).append(
                    "Experiment failed with no metrics — any reported numerical "
                    "results are unsupported and likely fabricated."
                )
                logger.warning(
                    "BUG-25: Experiment failed — capping quality score from %.1f to 3.0",
                    _orig_score,
                )
    if report is None:
        report = _default_quality_report(config.research.quality_threshold)
    report.setdefault("generated", _utcnow_iso())
    (stage_dir / "quality_report.json").write_text(
        json.dumps(report, indent=2), encoding="utf-8"
    )
    # T2.1: Enforce quality gate — fail if score below threshold
    score = report.get("score_1_to_10", 0)
    # BUG-R5-01: score can be string from LLM JSON — coerce to float
    if not isinstance(score, (int, float)):
        try:
            score = float(score)
        except (TypeError, ValueError):
            score = 0
    verdict = report.get("verdict", "proceed")
    threshold = config.research.quality_threshold or 5.0
    # --- Fabrication flag: collect real metrics for Stage 22 sanitization ---
    _fabrication_info: dict[str, Any] = {
        "experiment_failed": _exp_failed,
        "quality_score": score,
        "real_metric_values": [],
    }
    if isinstance(_exp_summary, dict):
        # Collect ALL real numeric values from experiment_summary.json
        _cond_summaries = _exp_summary.get("condition_summaries", {})
        if isinstance(_cond_summaries, dict):
            for cond_name, cond_data in _cond_summaries.items():
                if not isinstance(cond_data, dict):
                    continue
                cond_status = cond_data.get("status", "")
                if cond_status == "failed":
                    continue  # skip failed conditions
                for k, v in cond_data.items():
                    # Counters (seed_count etc.) are excluded — only metric-like
                    # numeric values go into the whitelist.
                    if isinstance(v, (int, float)) and k not in (
                        "seed_count", "total_steps", "training_steps",
                    ):
                        _fabrication_info["real_metric_values"].append(
                            round(float(v), 4)
                        )
        _ms = _exp_summary.get("metrics_summary", {})
        if isinstance(_ms, dict):
            for _mk, _mv in _ms.items():
                if isinstance(_mv, dict):
                    for _stat in ("mean", "min", "max"):
                        _sv = _mv.get(_stat)
                        if isinstance(_sv, (int, float)):
                            _fabrication_info["real_metric_values"].append(
                                round(float(_sv), 4)
                            )
    _fabrication_info["has_real_data"] = bool(
        _fabrication_info["real_metric_values"]
    )
    _fabrication_info["fabrication_suspected"] = (
        _exp_failed and not _fabrication_info["has_real_data"]
    )
    # Phase 1: Enhanced fabrication detection via VerifiedRegistry
    # BUG-108: Also pass refinement_log so NaN best_metric is properly handled
    _rl20_candidates = sorted(run_dir.glob("stage-13*/refinement_log.json"), reverse=True)
    _rl20_path = _rl20_candidates[0] if _rl20_candidates else None
    _rl20: dict | None = None
    if _rl20_path and _rl20_path.is_file():
        try:
            _rl20 = json.loads(_rl20_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            pass
    # NOTE(review): _rl20 is loaded above but never used below — the BUG-108
    # "also pass refinement_log" wiring appears incomplete; confirm intent.
    try:
        from researchclaw.pipeline.verified_registry import VerifiedRegistry as _VR20
        _vr20 = _VR20.from_run_dir(run_dir, metric_direction=config.experiment.metric_direction, best_only=True) if isinstance(_exp_summary, dict) else None
        if _vr20:
            _fabrication_info["verified_values_count"] = len(_vr20.values)
            _fabrication_info["verified_conditions"] = sorted(_vr20.condition_names)
    except Exception:
        pass
    (stage_dir / "fabrication_flags.json").write_text(
        json.dumps(_fabrication_info, indent=2), encoding="utf-8"
    )
    if isinstance(score, (int, float)) and score < threshold:
        if config.research.graceful_degradation:
            logger.warning(
                "Quality gate DEGRADED: score %.1f < threshold %.1f — "
                "continuing with sanitization (graceful_degradation=True)",
                score, threshold,
            )
            # Write degradation signal for downstream stages
            signal = {
                "score": score,
                "threshold": threshold,
                "verdict": verdict,
                "weaknesses": report.get("weaknesses", []),
                "generated": _utcnow_iso(),
            }
            (run_dir / "degradation_signal.json").write_text(
                json.dumps(signal, indent=2), encoding="utf-8"
            )
            return StageResult(
                stage=Stage.QUALITY_GATE,
                status=StageStatus.DONE,
                artifacts=("quality_report.json",),
                evidence_refs=("stage-20/quality_report.json",),
                decision="degraded",
            )
        logger.warning(
            "Quality gate FAILED: score %.1f < threshold %.1f (verdict=%s)",
            score, threshold, verdict,
        )
        return StageResult(
            stage=Stage.QUALITY_GATE,
            status=StageStatus.FAILED,
            artifacts=("quality_report.json", "fabrication_flags.json"),
            evidence_refs=("stage-20/quality_report.json",),
            error=f"Quality score {score:.1f}/10 below threshold {threshold:.1f}. "
            f"Paper needs revision before export.",
        )
    logger.info(
        "Quality gate PASSED: score %.1f >= threshold %.1f",
        score, threshold,
    )
    return StageResult(
        stage=Stage.QUALITY_GATE,
        status=StageStatus.DONE,
        artifacts=("quality_report.json", "fabrication_flags.json"),
        evidence_refs=("stage-20/quality_report.json",),
    )
# ---------------------------------------------------------------------------
# Stage 21: Knowledge Archive
# ---------------------------------------------------------------------------
def _execute_knowledge_archive(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 21: distill lessons learned and index every run artifact.

    Writes ``archive.md`` (LLM-authored when a client is available,
    otherwise a static skeleton) plus ``bundle_index.json`` listing every
    file under the run's ``stage-*`` directories.
    """
    paper_text = _read_prior_artifact(run_dir, "paper_revised.md") or ""
    analysis = _read_best_analysis(run_dir)
    decision = _read_prior_artifact(run_dir, "decision.md") or ""
    preamble = _build_context_preamble(config, run_dir, include_goal=True)

    if llm is None:
        # Offline fallback: static archive skeleton.
        archive = f"""# Knowledge Archive
## Lessons Learned
- Preserve strict metric reporting protocol.
- Keep refinement logs aligned with code changes.
## Reproducibility
- Include exact experiment script and schedule.
- Capture run-level JSON metrics.
## Future Work
- Extend robustness and external validity checks.
Generated: {_utcnow_iso()}
"""
    else:
        manager = prompts or PromptManager()
        overlay = _get_evolution_overlay(run_dir, "knowledge_archive")
        stage_prompt = manager.for_stage(
            "knowledge_archive",
            evolution_overlay=overlay,
            preamble=preamble,
            decision=decision,
            analysis=analysis,
            revised=paper_text[:15000],  # cap prompt context size
        )
        response = _chat_with_prompt(
            llm,
            stage_prompt.system,
            stage_prompt.user,
            json_mode=stage_prompt.json_mode,
            max_tokens=stage_prompt.max_tokens,
        )
        archive = response.content

    (stage_dir / "archive.md").write_text(archive, encoding="utf-8")

    # Index every artifact under stage-*/; exclude the index file itself,
    # which only matters when this stage is re-run.
    index_path = stage_dir / "bundle_index.json"
    files = [
        str(artifact.relative_to(run_dir))
        for stage_subdir in sorted(run_dir.glob("stage-*"))
        for artifact in sorted(stage_subdir.rglob("*"))
        if artifact.is_file() and artifact != index_path
    ]
    index = {
        "run_id": run_dir.name,
        "generated": _utcnow_iso(),
        "artifact_count": len(files),
        "artifacts": files,
    }
    index_path.write_text(json.dumps(index, indent=2), encoding="utf-8")
    return StageResult(
        stage=Stage.KNOWLEDGE_ARCHIVE,
        status=StageStatus.DONE,
        artifacts=("archive.md", "bundle_index.json"),
        evidence_refs=("stage-21/archive.md", "stage-21/bundle_index.json"),
    )
# ---------------------------------------------------------------------------
# _sanitize_fabricated_data helper
# ---------------------------------------------------------------------------
def _sanitize_fabricated_data(
paper: str,
run_dir: Path,
) -> tuple[str, dict[str, Any]]:
"""Replace unverified numerical data in markdown tables with '---'.
Loads experiment_summary.json as ground truth, extracts all verified
metric values, then scans markdown tables in Results/Experiment sections.
Numbers not matching any verified value (within 1% relative tolerance)
are replaced with ``---``.
Returns (sanitized_paper, sanitization_report).
"""
import re as _re_san
# --- 1. Build verified values set from experiment_summary.json ---
# BUG-222: After REFINE cycles, merging ALL stage-14* data creates a
# permissive registry that validates fabricated numbers from regressed
# iterations. Use ONLY the promoted best data as ground truth.
# experiment_summary_best.json is written by _promote_best_stage14() and
# contains the single best iteration's data.
verified_values: set[float] = set()
def _richness(path: Path) -> int:
"""Score an experiment_summary.json by how many conditions it has."""
try:
d = json.loads(path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
return -1
if not isinstance(d, dict):
return -1
conds = d.get("condition_summaries", {})
metrics = d.get("metrics_summary", {})
return len(conds) + len(metrics)
# BUG-222: Prefer experiment_summary_best.json (promoted best iteration).
# Only fall back to "richest stage-14*" scanning if best.json is missing
# (single-iteration runs without REFINE).
_root_best = run_dir / "experiment_summary_best.json"
if _root_best.exists() and _richness(_root_best) > 0:
exp_path = _root_best
else:
_candidates = list(run_dir.glob("stage-14*/experiment_summary.json"))
exp_path = max(_candidates, key=_richness) if _candidates else run_dir / "stage-14" / "experiment_summary.json"
if exp_path.exists():
try:
exp_data = json.loads(exp_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
exp_data = {}
def _collect_numbers(obj: Any, depth: int = 0) -> None:
if depth > 10:
return
if isinstance(obj, (int, float)) and not isinstance(obj, bool):
import math as _math_vv
if _math_vv.isfinite(float(obj)):
verified_values.add(float(obj))
elif isinstance(obj, dict):
for v in obj.values():
_collect_numbers(v, depth + 1)
elif isinstance(obj, list):
for v in obj:
_collect_numbers(v, depth + 1)
# Extract from well-known keys
for key in (
"metrics_summary", "condition_summaries", "best_run",
"condition_metrics", "conditions", "ablation_results",
):
if key in exp_data:
_collect_numbers(exp_data[key])
# BUG-222: Removed BUG-206 refinement_log scanning. The original BUG-206
# rationale was "Stage 17 injects sandbox metrics, so the sanitizer must
# recognise them". But that created a loophole: after REFINE regression,
# the LLM would cite regressed iteration numbers and the sanitizer would
# pass them because they were in the refinement log. Now that Stage 17
# also uses only the promoted best data (BUG-222), there is no need to
# whitelist all sandbox metrics here.
if not verified_values:
report: dict[str, Any] = {
"sanitized": False,
"reason": "no verified values found in experiment_summary.json",
"tables_processed": 0,
"numbers_replaced": 0,
}
return paper, report
def _is_verified(num: float) -> bool:
"""Check if num matches any verified value within 1% relative tolerance.
BUG-R5-20: Also checks percentage/decimal cross-matching
(e.g., 73.42 in paper vs 0.7342 in experiment, or vice versa).
"""
for v in verified_values:
if v == 0.0:
if abs(num) < 1e-9:
return True
elif abs(num - v) / abs(v) <= 0.01:
return True
# Cross-match: num might be percentage form of v (or vice versa)
elif v != 0.0 and abs(num / 100.0 - v) / abs(v) <= 0.01:
return True
elif v != 0.0 and abs(num - v * 100.0) / abs(v * 100.0) <= 0.01:
return True
return False
# --- 2. Find and sanitize markdown tables ---
# BUG-175: Always-allowed set — common constants, hyperparameters, and
# structural values that should never be sanitized (matches paper_verifier.py).
_SANITIZER_ALWAYS_ALLOWED: set[float] = {
0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0,
0.5, 0.01, 0.001, 0.0001, 0.1, 0.05, 0.95, 0.99,
2024.0, 2025.0, 2026.0, 2027.0,
8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0,
224.0, 299.0, 384.0, # Common image sizes
# BUG-192: Common hyperparameter values
0.0003, 3e-4, 0.0005, 5e-4, 0.002, 2e-3, # learning rates
0.2, 0.3, 0.25, 0.7, 0.6, 0.8, # clip epsilon, dropout, gradient clip, GCE q, common HP
0.9, 0.999, 0.9999, # Adam betas, momentum
0.02, 0.03, # weight init std
1e-5, 1e-6, 1e-8, # epsilon, weight decay
300.0, 400.0, 500.0, # epochs
4096.0, 8192.0, # larger batch sizes / hidden dims
}
# Match markdown table blocks (header + separator + data rows)
table_pat = _re_san.compile(
r"((?:^[ \t]*\|.+\|[ \t]*\n)+" # one or more pipe-delimited lines
r")",
_re_san.MULTILINE,
)
# Match numbers in table cells (integers, decimals, percentages, scientific)
# BUG-175: Also exclude hyphen in lookaround to protect method names like
# "Cos-200", "StepLR-100" from partial number extraction.
# BUG-206: Include Unicode hyphens (U+2010 hyphen, U+2011 non-breaking
# hyphen, U+2013 en-dash) — LLMs frequently emit these instead of ASCII
# hyphens in model names like "ResNet‑34".
# BUG-206: Unicode hyphens placed before escaped ASCII hyphen (\\-)
# to avoid creating unintended character ranges in the class.
_HYPH = "\u2010\u2011\u2013\\-" # U+2010 + U+2011 + U+2013 + ASCII hyphen
num_pat = _re_san.compile(
f"(? str:
nonlocal numbers_replaced, numbers_kept
num_str = m.group(1)
pct = m.group(2)
try:
val = float(num_str)
except ValueError:
return m.group(0)
# BUG-175: Always allow common constants / hyperparameters
if val in _SANITIZER_ALWAYS_ALLOWED:
numbers_kept += 1
return m.group(0)
# BUG-175: Small integer exemption — counts, indices,
# epoch numbers, etc. (≤ 20 auto-pass)
if val == int(val) and abs(val) <= 20:
numbers_kept += 1
return m.group(0)
if _is_verified(val):
numbers_kept += 1
return m.group(0)
numbers_replaced += 1
replaced_values.append(num_str + pct)
return "---"
def _sanitize_table(match: _re_san.Match[str]) -> str:
    """Sanitize one markdown pipe-table by running ``_replace_num`` on data cells.

    Leaves untouched: pseudo-tables without a separator row, pure
    hyperparameter/config tables, statistical-analysis tables, the header
    and separator rows, the leading label column, and any column whose
    header classifies it as a hyperparameter column.
    """
    nonlocal numbers_replaced, numbers_kept, tables_processed
    table_text = match.group(0)
    lines = table_text.split("\n")
    # Check if this looks like a results/experiment table
    # (heuristic: has a separator row with dashes)
    has_separator = any(
        _re_san.match(r"^[ \t]*\|[\s:|-]+\|[ \t]*$", line)
        for line in lines
    )
    if not has_separator:
        return table_text
    # BUG-192: Detect hyperparameter/config tables and SKIP sanitization.
    # These tables contain design choices, not experimental results.
    _HP_TABLE_KW = {
        "hyperparameter", "hyper-parameter", "configuration", "config",
        "setting", "parameter", "learning rate", "lr", "batch size",
        "optimizer", "architecture", "schedule", "warmup", "decay",
        "dropout", "weight decay", "momentum", "epsilon", "clip",
    }
    # BUG-224: Statistical analysis tables contain derived values
    # (t-statistics, p-values, effect sizes) that are computed from
    # the experiment data but never appear in experiment_summary.json.
    # These tables should NOT be sanitized.
    _STAT_TABLE_KW = {
        "t-statistic", "t-stat", "t statistic", "p-value", "p value",
        "paired", "cohen", "effect size", "wilcoxon", "mann-whitney",
        "statistical", "significance", "confidence interval",
    }
    _RESULT_TABLE_KW = {
        "accuracy", "acc", "loss", "f1", "auroc", "auc", "precision",
        "recall", "bleu", "rouge", "reward", "return", "rmse", "mae",
        "mse", "error", "score", "metric", "performance", "improvement",
        "top-1", "top1", "top-5", "top5",
    }
    # Table classification is keyed off the header row only.
    _header_lower = lines[0].lower() if lines else ""
    _is_hp_table = any(kw in _header_lower for kw in _HP_TABLE_KW)
    _is_result_table = any(kw in _header_lower for kw in _RESULT_TABLE_KW)
    # BUG-224: Statistical analysis tables (t-tests, p-values) contain
    # derived values that are never in experiment_summary.json.
    _is_stat_table = any(kw in _header_lower for kw in _STAT_TABLE_KW)
    if _is_hp_table and not _is_result_table:
        return table_text  # Skip sanitization for HP/config tables
    if _is_stat_table:
        return table_text  # Skip sanitization for statistical test tables
    # BUG-184: Per-column HP detection — classify each column header
    # as HP-type (skip sanitization) or result-type (sanitize).
    # This handles mixed tables like "| Method | LR | Acc | F1 |"
    # where LR should be preserved but Acc/F1 are verified.
    _HP_COL_KW = {
        "lr", "learning rate", "batch", "epoch", "optimizer",
        "schedule", "warmup", "decay", "dropout", "momentum",
        "clip", "epsilon", "eps", "beta", "alpha", "gamma",
        "lambda", "weight decay", "wd", "temperature", "temp",
        "hidden", "dim", "layers", "heads", "steps", "iterations",
        "seed", "patience", "#param", "params", "size", "depth",
        "width", "channels", "kernel", "stride", "padding",
        # BUG-224: Statistical test columns (derived, not in experiment data)
        "t-stat", "t stat", "p-value", "p value", "p-val",
        "cohen", "effect", "ci lower", "ci upper", "difference",
    }
    _hp_cols: set[int] = set()  # column indices that are HP columns
    if lines:
        # Header is split with the same split("|") used on data rows below,
        # so the indices in _hp_cols line up cell-for-cell.
        _hdr_cells = lines[0].split("|")
        for _ci, _hc in enumerate(_hdr_cells):
            _hc_low = _hc.strip().lower()
            if any(kw in _hc_low for kw in _HP_COL_KW):
                _hp_cols.add(_ci)
    tables_processed += 1
    sanitized_lines: list[str] = []
    for i, line in enumerate(lines):
        # Skip header row and separator row
        is_separator = bool(
            _re_san.match(r"^[ \t]*\|[\s:|-]+\|[ \t]*$", line)
        )
        is_header = i == 0  # first line is typically the header
        if is_separator or is_header:
            sanitized_lines.append(line)
            continue
        # BUG-175: Split by pipe and only sanitize cells after
        # the first data column (which typically contains method
        # names, condition labels, etc.)
        cells = line.split("|")
        sanitized_cells: list[str] = []
        for ci, cell in enumerate(cells):
            # Skip first non-empty cell (method/label column),
            # empty edge cells, and BUG-184 HP-classified columns.
            # ci == 0 is the text before the leading pipe, ci == 1 is
            # the label column.
            if ci <= 1 or not cell.strip() or ci in _hp_cols:
                sanitized_cells.append(cell)
            else:
                sanitized_cells.append(
                    num_pat.sub(_replace_num, cell)
                )
        sanitized_lines.append("|".join(sanitized_cells))
    return "\n".join(sanitized_lines)
sanitized = table_pat.sub(_sanitize_table, paper)
# --- BUG-211: LaTeX tabular sanitization ---
# LLMs sometimes write results in LaTeX \begin{tabular} format inside
# the markdown paper (often within ```latex fences). The markdown
# table regex above misses these entirely, allowing fabricated numbers
# to pass through unchecked.
latex_tab_pat = _re_san.compile(
r"(\\begin\{tabular\}.*?\\end\{tabular\})",
_re_san.DOTALL,
)
# Keywords for HP-table vs result-table classification (reuse from above)
_LTX_HP_KW = {
"hyperparameter", "hyper-parameter", "configuration", "config",
"setting", "learning rate", "lr", "batch size", "optimizer",
}
_LTX_RESULT_KW = {
"accuracy", "acc", "loss", "f1", "auroc", "auc", "precision",
"recall", "reward", "score", "metric", "performance", "result",
}
# BUG-224: Statistical analysis LaTeX tables — derived values
_LTX_STAT_KW = {
"t-statistic", "t-stat", "t statistic", "p-value", "p value",
"paired", "cohen", "effect size", "statistical", "significance",
}
def _sanitize_latex_table(match: _re_san.Match[str]) -> str:
    r"""Sanitize one LaTeX ``tabular`` block (BUG-211).

    Classifies the table from ±300 chars of surrounding text (column
    spec, header row, and a possible preceding \caption), skips
    HP/config and statistical-analysis tables, then runs
    ``_replace_num`` over data cells of every row that appears after the
    first \midrule/\hline.
    """
    nonlocal tables_processed
    block = match.group(0)
    # Heuristic: look at the first ~300 chars (column spec + header row)
    # to decide HP vs result table. Also check preceding \caption if
    # the match is part of a \begin{table} environment — we can look
    # backwards a bit in the full text for the caption.
    _start = match.start()
    _context = sanitized[max(0, _start - 300):_start + 300].lower()
    _is_hp = any(kw in _context for kw in _LTX_HP_KW)
    _is_res = any(kw in _context for kw in _LTX_RESULT_KW)
    # BUG-224: Statistical test tables — derived values not in experiment data
    _is_stat = any(kw in _context for kw in _LTX_STAT_KW)
    if _is_hp and not _is_res:
        return block  # HP/config table — skip
    if _is_stat:
        return block  # Statistical analysis table — skip
    tables_processed += 1
    # Split into rows by \\ (LaTeX row separator).
    # We split on \\ but keep the delimiter so we can reconstruct.
    parts = _re_san.split(r"(\\\\)", block)
    result_parts: list[str] = []
    _seen_midrule = False  # True once past the header/body rule line
    for part in parts:
        # Preserve row separators as-is
        if part == "\\\\":
            result_parts.append(part)
            continue
        _stripped = part.strip()
        # Rule lines — no numbers to sanitize
        if _re_san.search(
            r"\\(hline|toprule|midrule|bottomrule|cline|cmidrule)",
            _stripped,
        ):
            if "midrule" in _stripped or "hline" in _stripped:
                _seen_midrule = True
            result_parts.append(part)
            continue
        # Column spec line (contains \begin{tabular}{...})
        if r"\begin{tabular}" in part:
            result_parts.append(part)
            continue
        # End line
        if r"\end{tabular}" in part:
            result_parts.append(part)
            continue
        # Header row: rows before the first \midrule/\hline
        if not _seen_midrule:
            result_parts.append(part)
            continue
        # Data row — split by & and sanitize cells after the first
        cells = part.split("&")
        sanitized_cells: list[str] = []
        for ci, cell in enumerate(cells):
            if ci == 0:
                # First cell is method/condition name — preserve
                sanitized_cells.append(cell)
            else:
                sanitized_cells.append(num_pat.sub(_replace_num, cell))
        result_parts.append("&".join(sanitized_cells))
    return "".join(result_parts)
sanitized = latex_tab_pat.sub(_sanitize_latex_table, sanitized)
# --- Improvement F: Prose-level anti-fabrication ---
# Scan Results/Experiments sections for inline numeric claims like
# "achieved 94.2% accuracy" or "obtained an AUROC of 0.87".
# Replace unverified numbers with "[value removed]".
prose_numbers_replaced = 0
_prose_pattern = _re_san.compile(
r"(?:achiev|obtain|reach|attain|yield|report|record|produc|demonstrat|show|observ)"
r"(?:ed|es|ing|s)?\s+"
r"(?:an?\s+)?(?:\w+\s+)?(?:of\s+)?"
r"(\d+\.?\d*)\s*"
r"(%|\\%)?",
_re_san.IGNORECASE,
)
# Only process lines in Results/Experiments sections
_in_results_section = False
_results_headers = _re_san.compile(
r"^#{1,3}\s*(Results|Experiments|Experimental|Evaluation|Ablation)",
_re_san.IGNORECASE,
)
_any_header = _re_san.compile(r"^#{1,3}\s+")
_sanitized_lines = []
for _line in sanitized.split("\n"):
if _results_headers.match(_line):
_in_results_section = True
elif _any_header.match(_line) and _in_results_section:
# Check if we're leaving Results for a different top-level section
_header_text = _line.lstrip("#").strip().lower()
if _header_text and not any(kw in _header_text for kw in
("result", "experiment", "ablation", "evaluation", "comparison")):
_in_results_section = False
if _in_results_section and "|" not in _line: # skip table rows
def _replace_prose_num(m: _re_san.Match[str]) -> str:
nonlocal prose_numbers_replaced
num_str = m.group(1)
try:
val = float(num_str)
except ValueError:
return m.group(0)
# Skip common constants / small integers
if val in _SANITIZER_ALWAYS_ALLOWED:
return m.group(0)
if val == int(val) and abs(val) <= 20:
return m.group(0)
if _is_verified(val):
return m.group(0)
prose_numbers_replaced += 1
return m.group(0).replace(num_str + (m.group(2) or ""), "[value removed]")
_line = _prose_pattern.sub(_replace_prose_num, _line)
_sanitized_lines.append(_line)
sanitized = "\n".join(_sanitized_lines)
report = {
"sanitized": numbers_replaced > 0 or prose_numbers_replaced > 0,
"tables_processed": tables_processed,
"numbers_replaced": numbers_replaced,
"numbers_kept": numbers_kept,
"prose_numbers_replaced": prose_numbers_replaced,
"verified_values_count": len(verified_values),
"replaced_samples": replaced_values[:20],
"generated": _utcnow_iso(),
}
return sanitized, report
# ---------------------------------------------------------------------------
# BUG-176: Missing citation resolution
# BUG-194: Validate search results to avoid replacing correct entries with
# garbage. Previous code searched by cite-key fragments (e.g.
# "he 2016 deep") which returned completely unrelated papers.
# Fix: (1) consult seminal_papers.yaml first, (2) require title-
# similarity validation for API results, (3) build better queries.
# ---------------------------------------------------------------------------
# Minimum title-similarity between search result and expected title/query
# for a result to be accepted. Prevents "Jokowi and the New Developmentalism"
# from replacing "Deep Residual Learning for Image Recognition".
_CITATION_RESOLVE_MIN_SIMILARITY = 0.30
def _load_seminal_papers_by_key() -> dict[str, dict]:
"""Load seminal_papers.yaml and index by cite_key.
Returns dict like::
{"he2016deep": {"title": "Deep Residual Learning...", "authors": "He et al.", ...}, ...}
Returns empty dict on any failure (missing file, bad YAML, etc.).
"""
try:
from researchclaw.data import _load_all as _load_seminal_all
all_papers = _load_seminal_all()
return {p["cite_key"]: p for p in all_papers if "cite_key" in p}
except Exception: # noqa: BLE001
return {}
def _seminal_to_bibtex(paper: dict, cite_key: str) -> str:
"""Convert a seminal_papers.yaml entry dict to a BibTeX string."""
title = paper.get("title", "Unknown")
authors = paper.get("authors", "Unknown")
year = paper.get("year", "")
venue = paper.get("venue", "")
# Decide entry type
venue_lower = (venue or "").lower()
is_conf = any(kw in venue_lower for kw in (
"neurips", "nips", "icml", "iclr", "cvpr", "eccv", "iccv",
"aaai", "acl", "emnlp", "naacl", "sigir", "kdd", "www",
"ijcai", "conference", "proc", "workshop",
))
if is_conf:
return (
f"@inproceedings{{{cite_key},\n"
f" title = {{{title}}},\n"
f" author = {{{authors}}},\n"
f" year = {{{year}}},\n"
f" booktitle = {{{venue}}},\n"
f"}}"
)
return (
f"@article{{{cite_key},\n"
f" title = {{{title}}},\n"
f" author = {{{authors}}},\n"
f" year = {{{year}}},\n"
f" journal = {{{venue}}},\n"
f"}}"
)
def _resolve_missing_citations(
missing_keys: set[str],
existing_bib: str,
) -> tuple[set[str], list[str]]:
"""Try to find BibTeX entries for citation keys not in references.bib.
Parses each cite_key (e.g. ``hendrycks2017baseline``) into an author name
and year, then searches academic APIs. Returns ``(resolved_keys,
new_bib_entries)`` where each entry is a complete BibTeX string.
BUG-194 fix: Three-layer resolution strategy:
1. **Seminal lookup** — check seminal_papers.yaml (zero API calls, exact match)
2. **API search with validation** — search Semantic Scholar / arXiv, but ONLY
accept results whose title has ≥ 30% word overlap with query terms.
Previously any year-matching result was blindly accepted, causing
foundational papers to be replaced with garbage.
3. **Skip** — if no confident match, leave the citation unresolved rather
than inject a wrong paper.
Gracefully returns empty results on any network failure.
"""
import re as _re176
import time as _time176
resolved: set[str] = set()
new_entries: list[str] = []
def _parse_cite_key(key: str) -> tuple[str, str, str]:
"""Extract (author, year, keyword_hint) from a citation key.
Common patterns:
``he2016deep`` → ("he", "2016", "deep")
``vaswani2017attention`` → ("vaswani", "2017", "attention")
``goodfellow2014generative`` → ("goodfellow", "2014", "generative")
"""
m = _re176.match(r"([a-zA-Z]+?)(\d{4})(.*)", key)
if m:
return m.group(1), m.group(2), m.group(3)
return key, "", ""
def _title_word_overlap(title: str, query_words: list[str]) -> float:
"""Word-overlap score between a paper title and query keywords.
Returns fraction of query words found in the title (0.0–1.0).
Used to validate that a search result is actually relevant.
"""
if not query_words:
return 0.0
title_lower = set(
_re176.sub(r"[^a-z0-9\s]", "", title.lower()).split()
) - {""}
if not title_lower:
return 0.0
matched = sum(1 for w in query_words if w.lower() in title_lower)
return matched / len(query_words)
# --- Layer 1: Seminal papers lookup (no API calls) ---
seminal_by_key = _load_seminal_papers_by_key()
for key in sorted(missing_keys):
if key in seminal_by_key and key not in existing_bib:
sp = seminal_by_key[key]
bib_entry = _seminal_to_bibtex(sp, key)
new_entries.append(bib_entry)
resolved.add(key)
logger.info(
"BUG-194: Resolved %r via seminal_papers.yaml → %r (%s)",
key, sp.get("title", "")[:60], sp.get("year", ""),
)
# Remaining keys that weren't in the seminal database AND aren't already
# present in the existing bib (no point re-resolving keys we already have).
remaining = sorted(
k for k in (missing_keys - resolved) if k not in existing_bib
)
if not remaining:
return resolved, new_entries
# --- Layer 2: API search with title-similarity validation ---
try:
from researchclaw.literature.search import search_papers
except ImportError:
logger.debug("BUG-176: literature.search not available, skipping resolution")
return resolved, new_entries
for key in remaining:
author, year, hint = _parse_cite_key(key)
if not author or not year:
continue
# BUG-194: Build a better search query.
# Instead of "he 2016 deep", use "he deep residual learning 2016" or
# at minimum, split camelCase hints into separate words.
# Split hint on word boundaries (camelCase or underscore).
hint_words = _re176.findall(r"[a-zA-Z]+", hint) if hint else []
# The query words used for validation
query_words = [author] + hint_words
# Build search query: author + hint words + year (year helps but isn't
# the primary discriminator anymore)
query_parts = [author] + hint_words + [year]
query = " ".join(query_parts)
try:
results = search_papers(query, limit=5, deduplicate=True)
except Exception as exc:
logger.debug("BUG-176: Search failed for %r: %s", key, exc)
continue
if not results:
logger.debug(
"BUG-194: No search results for %r (query=%r), skipping",
key, query,
)
continue
# BUG-194: Find best match by title-word-overlap AND year match.
# Previously the code just took the first year-matching result.
best = None
best_score = -1.0
for paper in results:
overlap = _title_word_overlap(paper.title, query_words)
year_bonus = 0.2 if str(paper.year) == year else 0.0
# Also give bonus for author name appearing in paper.authors
author_bonus = 0.0
if any(author.lower() in a.name.lower() for a in paper.authors):
author_bonus = 0.2
score = overlap + year_bonus + author_bonus
if score > best_score:
best_score = score
best = paper
if best is None:
continue
# BUG-194: Validate the result — require minimum similarity.
# This is the KEY fix: previously ANY result was accepted blindly.
overlap = _title_word_overlap(best.title, query_words)
if overlap < _CITATION_RESOLVE_MIN_SIMILARITY:
logger.info(
"BUG-194: Rejecting search result for %r — title %r has "
"too-low overlap (%.2f < %.2f) with query words %r",
key, best.title[:60], overlap,
_CITATION_RESOLVE_MIN_SIMILARITY, query_words,
)
continue
# Year must also match (or be within 1 year — sometimes conferences
# vs arXiv preprint have different years)
if year and best.year:
year_diff = abs(int(year) - int(best.year))
if year_diff > 1:
logger.info(
"BUG-194: Rejecting search result for %r — year mismatch "
"(%s vs %s, diff=%d)",
key, year, best.year, year_diff,
)
continue
# Generate BibTeX with the ORIGINAL cite_key (so \cite{key} works)
bib_entry = best.to_bibtex()
# Replace the auto-generated cite_key with the one used in the paper
orig_key_match = _re176.match(r"@(\w+)\{([^,]+),", bib_entry)
if orig_key_match:
bib_entry = bib_entry.replace(
f"@{orig_key_match.group(1)}{{{orig_key_match.group(2)},",
f"@{orig_key_match.group(1)}{{{key},",
1,
)
# Verify entry doesn't duplicate an existing key
if key not in existing_bib:
new_entries.append(bib_entry)
resolved.add(key)
logger.info(
"BUG-194: Resolved %r via API → %r (%s, overlap=%.2f)",
key, best.title[:60], best.year, overlap,
)
else:
logger.debug(
"BUG-194: Key %r already in bib, skipping API result", key,
)
# Rate limit: 0.5s between API calls
_time176.sleep(0.5)
return resolved, new_entries
# ---------------------------------------------------------------------------
# Stage 22: Export & Publish
# ---------------------------------------------------------------------------
def _execute_export_publish(
stage_dir: Path,
run_dir: Path,
config: RCConfig,
adapters: AdapterBundle,
*,
llm: LLMClient | None = None,
prompts: PromptManager | None = None,
) -> StageResult:
revised = _read_prior_artifact(run_dir, "paper_revised.md") or ""
if llm is not None:
_pm = prompts or PromptManager()
_overlay = _get_evolution_overlay(run_dir, "export_publish")
sp = _pm.for_stage("export_publish", evolution_overlay=_overlay, revised=revised)
resp = _chat_with_prompt(
llm,
sp.system,
sp.user,
json_mode=sp.json_mode,
max_tokens=sp.max_tokens,
)
final_paper = resp.content
# Content guard: reject LLM output that truncates the paper
if revised and len(final_paper) < 0.6 * len(revised):
logger.warning(
"Stage 22: LLM output is %.0f%% of input length — using original",
100 * len(final_paper) / max(len(revised), 1),
)
final_paper = revised
else:
final_paper = revised
if not final_paper.strip():
final_paper = "# Final Paper\n\nNo content generated."
# --- Always-on fabrication sanitization (Phase 1 anti-fabrication) ---
# Back up pre-sanitized version
(stage_dir / "paper_presanitized.md").write_text(
final_paper, encoding="utf-8"
)
# Sanitize unverified data in tables — always-on, not just degraded mode
final_paper, _san_report = _sanitize_fabricated_data(
final_paper, run_dir
)
(stage_dir / "sanitization_report.json").write_text(
json.dumps(_san_report, indent=2), encoding="utf-8"
)
if _san_report.get("numbers_replaced", 0) > 0:
logger.info(
"Stage 22: Fabrication sanitization — %d numbers replaced, %d kept",
_san_report.get("numbers_replaced", 0),
_san_report.get("numbers_kept", 0),
)
# Graceful degradation: insert notice only when quality gate was degraded
_degradation_signal_path = run_dir / "degradation_signal.json"
if _degradation_signal_path.exists():
try:
_deg_signal = json.loads(
_degradation_signal_path.read_text(encoding="utf-8")
)
except (json.JSONDecodeError, OSError):
_deg_signal = {}
# Insert degradation notice after abstract
_deg_score = _deg_signal.get("score", "N/A")
_deg_threshold = _deg_signal.get("threshold", "N/A")
_deg_notice = (
"\n\n> **Note:** This paper was produced in degraded mode. "
f"Quality gate score ({_deg_score}/{_deg_threshold}) was below "
"threshold. Unverified numerical results in tables have been "
"replaced with `---` and require independent verification.\n\n"
)
# Try to insert after ## Abstract section
_abstract_markers = ["## Abstract\n", "# Abstract\n"]
_notice_inserted = False
for _marker in _abstract_markers:
if _marker in final_paper:
_marker_end = final_paper.index(_marker) + len(_marker)
# Find the end of the abstract paragraph
_next_section = final_paper.find("\n## ", _marker_end)
_next_heading = final_paper.find("\n# ", _marker_end)
_insert_pos = min(
p for p in (_next_section, _next_heading)
if p > 0
) if any(p > 0 for p in (_next_section, _next_heading)) else len(final_paper)
final_paper = (
final_paper[:_insert_pos]
+ _deg_notice
+ final_paper[_insert_pos:]
)
_notice_inserted = True
break
if not _notice_inserted:
# Fallback: prepend to paper
final_paper = _deg_notice + final_paper
logger.info(
"Stage 22: Applied degraded-mode notice (score=%s, threshold=%s)",
_deg_score, _deg_threshold,
)
# IMP-3: Deduplicate "due to computational constraints" — keep at most 1
import re as _re_imp3
_CONSTRAINT_PAT = _re_imp3.compile(
r"[Dd]ue to computational constraints", _re_imp3.IGNORECASE
)
_matches = list(_CONSTRAINT_PAT.finditer(final_paper))
if len(_matches) > 1:
# Keep only the first occurrence; remove subsequent ones by
# deleting the enclosing sentence.
for m in reversed(_matches[1:]):
# Find sentence boundaries around the match
start = final_paper.rfind(".", 0, m.start())
start = start + 1 if start >= 0 else m.start()
end = final_paper.find(".", m.end())
end = end + 1 if end >= 0 else m.end()
sentence = final_paper[start:end].strip()
if sentence:
final_paper = final_paper[:start] + final_paper[end:]
final_paper = re.sub(r"[^\S\n]{2,}", " ", final_paper)
logger.info(
"Stage 22: Removed %d duplicate 'computational constraints' "
"disclaimers",
len(_matches) - 1,
)
# IMP-19 Layer 2: Ensure at least figures are referenced in the paper
import re as _re_fig
chart_files = []
# BUG-215: Also search stage-14* versioned dirs (stage-14_v1, etc.)
# in case stage-14/ was renamed and never recreated.
_chart_search_dirs = [stage_dir / "charts", run_dir / "stage-14" / "charts"]
for _s14_charts in sorted(run_dir.glob("stage-14*/charts"), reverse=True):
if _s14_charts not in _chart_search_dirs:
_chart_search_dirs.append(_s14_charts)
for _chart_src_dir in _chart_search_dirs:
if _chart_src_dir.is_dir():
chart_files.extend(sorted(_chart_src_dir.glob("*.png")))
# BUG-190: Also inject charts not already referenced in the paper.
# The old condition only fired when NO figures were present. Now we
# filter to only unreferenced charts, so partially-illustrated papers
# also get the remaining charts injected.
_already_referenced = set()
for _cf in chart_files:
if _cf.name in final_paper:
_already_referenced.add(_cf.name)
chart_files = [cf for cf in chart_files if cf.name not in _already_referenced]
if chart_files:
# Distribute figures to relevant sections based on filename keywords
_fig_placement: dict[str, list[str]] = {
"method": [], # architecture, method, model, pipeline diagrams
"result": [], # experiment, comparison, ablation charts
"intro": [], # concept, overview, illustration
}
_fig_counter = len(_already_referenced) # start numbering after existing figs
for cf in chart_files[:6]:
_fig_counter += 1
stem_lower = cf.stem.lower()
label = cf.stem.replace("_", " ").title()
fig_md = f""
if any(k in stem_lower for k in ("architecture", "model", "pipeline", "method", "flowchart")):
_fig_placement["method"].append(fig_md)
elif any(k in stem_lower for k in ("experiment", "comparison", "ablation", "result", "metric")):
_fig_placement["result"].append(fig_md)
elif any(k in stem_lower for k in ("concept", "overview", "illustration", "threat", "attack")):
_fig_placement["intro"].append(fig_md)
else:
_fig_placement["result"].append(fig_md) # default to results
# Insert figures at relevant section boundaries.
# BUG-200: Match both H1 (#) and H2 (##) headings — LLMs generate
# either level depending on the writing_structure prompt.
_section_markers = {
"method": ["# Method", "## Method", "# Methodology", "## Methodology",
"# Approach", "## Approach", "# Framework", "## Framework",
"## 3. Method", "## 3 Method"],
"result": ["# Results", "## Results", "# Experiments", "## Experiments",
"# Evaluation", "## Evaluation",
"## 5. Results", "## 4. Experiments", "## 5 Results"],
"intro": ["# Related Work", "## Related Work", "# Background",
"## Background", "## 2. Related", "## 2 Related Work"],
}
_total_inserted = 0
for category, figs in _fig_placement.items():
if not figs:
continue
fig_block = "\n\n" + "\n\n".join(figs) + "\n\n"
inserted = False
for marker in _section_markers.get(category, []):
if marker in final_paper:
# Insert BEFORE the marker section (so figure appears at end of previous section)
final_paper = final_paper.replace(marker, fig_block + marker, 1)
inserted = True
_total_inserted += len(figs)
break
if not inserted:
# Fallback: insert before Conclusion/Limitations/Discussion
for fallback in ["# Conclusion", "## Conclusion",
"# Limitations", "## Limitations",
"# Discussion", "## Discussion"]:
if fallback in final_paper:
final_paper = final_paper.replace(fallback, fig_block + fallback, 1)
inserted = True
_total_inserted += len(figs)
break
if not inserted:
# BUG-200: Last resort — insert before closing fence marker
# rather than appending after it (which puts content outside
# the markdown fence and gets dropped by converter).
_fence_end = final_paper.rfind("\n```")
if _fence_end > 0:
final_paper = (
final_paper[:_fence_end] + fig_block + final_paper[_fence_end:]
)
else:
final_paper += fig_block
_total_inserted += len(figs)
logger.info(
"IMP-19: Injected %d figure references into paper_final.md (distributed across sections)",
_total_inserted,
)
# IMP-24: Detect excessive number repetition
_numbers_found = _re_fig.findall(r"\b\d+\.\d{2,}\b", final_paper)
_num_counts = Counter(_numbers_found)
_repeated = {n: c for n, c in _num_counts.items() if c > 3}
if _repeated:
logger.warning(
"IMP-24: Numbers repeated >3 times: %s",
_repeated,
)
(stage_dir / "paper_final.md").write_text(final_paper, encoding="utf-8")
# --- Legacy fabrication sanitization (disabled — superseded by Phase 1 _sanitize_fabricated_data above) ---
# Kept but guarded: Phase 1 always-on sanitization handles this now.
# Only run if Phase 1 was somehow skipped (should never happen).
_fab_flags_text = _read_prior_artifact(run_dir, "fabrication_flags.json") or ""
_fab_flags = _safe_json_loads(_fab_flags_text, {}) if _fab_flags_text else {}
if (
isinstance(_fab_flags, dict)
and _fab_flags.get("fabrication_suspected")
and _san_report.get("numbers_replaced", 0) == 0 # Phase 1 didn't run/replace
):
import re as _re_fab
_real_vals = set()
for rv in _fab_flags.get("real_metric_values", []):
if isinstance(rv, (int, float)) and math.isfinite(rv):
_real_vals.add(str(round(rv, 4)))
_real_vals.add(str(round(rv, 2)))
_real_vals.add(str(round(rv, 1)))
if rv == int(rv):
_real_vals.add(str(int(rv)))
def _sanitize_number(m: _re_fab.Match) -> str: # type: ignore[name-defined]
"""Replace fabricated numbers with '--' but keep real ones."""
num_str = m.group(0)
# Keep the number if it matches any known real metric value
try:
num_val = float(num_str)
if not math.isfinite(num_val):
return "--"
rounded_strs = {
str(round(num_val, 4)),
str(round(num_val, 2)),
str(round(num_val, 1)),
*(
[str(int(num_val))] if num_val == int(num_val) else []
),
}
if rounded_strs & _real_vals:
return num_str # real value — keep it
except (ValueError, OverflowError):
return num_str
return "--"
# Only sanitize numbers in Results/Experiments/Evaluation/Ablation sections
_result_section_pat = _re_fab.compile(
r"(##\s*(?:\d+\.?\s*)?(?:Results|Experiments|Evaluation|Ablation"
r"|Experimental Results|Quantitative).*?)(?=\n##\s|\Z)",
_re_fab.DOTALL | _re_fab.IGNORECASE,
)
_sanitized_count = 0
def _sanitize_section(sec_match: _re_fab.Match) -> str: # type: ignore[name-defined]
nonlocal _sanitized_count
section_text = sec_match.group(0)
# Replace decimal numbers (e.g., 73.42, 0.891) but NOT integers
# that are likely structural (year, section number, figure number)
def _replace_in_section(m: _re_fab.Match) -> str: # type: ignore[name-defined]
nonlocal _sanitized_count
result = _sanitize_number(m)
if result == "--":
_sanitized_count += 1
return result
return _re_fab.sub(
r"\b\d+\.\d{1,6}\b", _replace_in_section, section_text
)
final_paper = _result_section_pat.sub(_sanitize_section, final_paper)
if _sanitized_count > 0:
logger.warning(
"Stage 22: Fabrication sanitization — blanked %d unsupported "
"numbers in Results sections (experiment had no real metrics)",
_sanitized_count,
)
# Rewrite the sanitized paper
(stage_dir / "paper_final.md").write_text(
final_paper, encoding="utf-8"
)
# Initialize artifacts list
artifacts = ["paper_final.md"]
# F2.7: Post-process citations — [cite_key] → \cite{cite_key}
# and copy final references.bib to export stage
_ay_map: dict[str, str] = {} # BUG-102: author-year → cite_key map
bib_text = _read_prior_artifact(run_dir, "references.bib")
if bib_text:
# Replace [cite_key] patterns in the final paper with \cite{cite_key}
# Collect all valid cite_keys from the bib file
import re as _re
valid_keys = set(_re.findall(r"@\w+\{([^,]+),", bib_text))
# BUG-102: Recover author-year citations → [cite_key] format.
# When Stage 19 (paper_revision) converts [cite_key] to [Author et al., 2024],
# the downstream regex can't match them. Build a reverse map from bib entries.
def _build_author_year_map(bib: str, keys: set[str]) -> dict[str, str]:
"""Build mapping from author-year patterns to cite_keys.
Returns dict like:
"Raissi et al., 2019" → "raissi2019physicsinformed"
"Tavella and Randall, 2000" → "tavella2000pricing"
"""
mapping: dict[str, str] = {}
# Parse each bib entry for author + year
# BUG-DA8-17: Allow newline OR whitespace before closing brace
# Use \n} or just } at start-of-line to avoid greedy cross-entry match
entry_pat = _re.compile(
r"@\w+\{([^,]+),\s*(.*?)(?:\n\}|^[ \t]*\})", _re.DOTALL | _re.MULTILINE
)
for m in entry_pat.finditer(bib):
key = m.group(1).strip()
if key not in keys:
continue
body = m.group(2)
# Extract author field
author_m = _re.search(
r"author\s*=\s*[\{\"](.*?)[\}\"]", body, _re.IGNORECASE
)
year_m = _re.search(
r"year\s*=\s*[\{\"]?(\d{4})[\}\"]?", body, _re.IGNORECASE
)
if not author_m or not year_m:
continue
author_raw = author_m.group(1).strip()
year = year_m.group(1)
# Parse author names (split on " and ")
authors = [a.strip() for a in _re.split(r"\s+and\s+", author_raw)]
# Extract last names
last_names = []
for a in authors:
if "," in a:
last_names.append(a.split(",")[0].strip())
else:
parts = a.split()
last_names.append(parts[-1] if parts else a)
if not last_names:
continue
# Generate author-year patterns:
# 1 author: "Smith, 2024"
# 2 authors: "Smith and Jones, 2024"
# 3+ authors: "Smith et al., 2024"
if len(last_names) == 1:
patterns = [f"{last_names[0]}, {year}"]
elif len(last_names) == 2:
patterns = [
f"{last_names[0]} and {last_names[1]}, {year}",
f"{last_names[0]} \\& {last_names[1]}, {year}",
]
else:
patterns = [
f"{last_names[0]} et al., {year}",
f"{last_names[0]} et al. {year}",
]
# Also add "Smith and Jones, 2024" for first two authors
patterns.append(
f"{last_names[0]} and {last_names[1]}, {year}"
)
for pat in patterns:
mapping[pat] = key
return mapping
_ay_map = _build_author_year_map(bib_text, valid_keys)
if _ay_map:
# Count how many author-year citations exist in the paper
_ay_found = 0
for _ay_pat in _ay_map:
if _ay_pat in final_paper:
_ay_found += 1
if _ay_found > 0:
logger.info(
"Stage 22: Found %d author-year citation patterns — "
"converting back to [cite_key] format.",
_ay_found,
)
# Sort by longest pattern first to avoid partial matches
for _ay_pat in sorted(_ay_map, key=len, reverse=True):
_ay_key = _ay_map[_ay_pat]
# Match [Author et al., 2024] or [Author and Jones, 2024; ...]
# Handle single-citation brackets
final_paper = final_paper.replace(
f"[{_ay_pat}]", f"[{_ay_key}]"
)
# Handle within multi-citation brackets [A et al., 2020; B et al., 2021]
# Replace the author-year segment only inside [...] brackets
final_paper = _re.sub(
r'\[([^\]]*?)' + _re.escape(_ay_pat) + r'([^\]]*?)\]',
lambda _m: '[' + _m.group(1) + _ay_key + _m.group(2) + ']',
final_paper,
)
# Fix multi-key brackets: [key1; key2] → [key1, key2]
# (author-year uses semicolons, cite-keys use commas)
def _fix_semicolon_cites(m_sc: _re.Match[str]) -> str:
inner = m_sc.group(1)
# Only convert if ALL segments look like cite keys
parts = [p.strip() for p in inner.split(";")]
_ck = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9_]*"
if all(_re.fullmatch(_ck, p) for p in parts):
return "[" + ", ".join(parts) + "]"
return m_sc.group(0)
final_paper = _re.sub(
r"\[([^\]]+;[^\]]+)\]", _fix_semicolon_cites, final_paper
)
(stage_dir / "paper_final.md").write_text(
final_paper, encoding="utf-8"
)
# R10-Fix4: Citation cross-validation
# BUG-187: Also parse multi-key brackets like [key1, key2, key3].
# The old regex only matched single-key brackets [key2020word].
_cite_key_pat = r"[a-zA-Z]+\d{4}[a-zA-Z0-9_-]*"
cited_keys_in_paper: set[str] = set()
# Single-key brackets
for m in _re.finditer(rf"\[({_cite_key_pat})\]", final_paper):
cited_keys_in_paper.add(m.group(1))
# Multi-key brackets [key1, key2] or [key1; key2]
for m in _re.finditer(r"\[([^\]]{10,300})\]", final_paper):
inner = m.group(1)
# Only parse if it looks like citation keys (has year-like digits)
parts = _re.split(r"[,;]\s*", inner)
if all(_re.fullmatch(_cite_key_pat, p.strip()) for p in parts if p.strip()):
for p in parts:
if p.strip():
cited_keys_in_paper.add(p.strip())
if valid_keys and cited_keys_in_paper:
invalid_keys = cited_keys_in_paper - valid_keys
if invalid_keys:
logger.warning(
"Stage 22: Found %d citation keys in paper not in references.bib: %s",
len(invalid_keys),
", ".join(sorted(invalid_keys)[:20]),
)
# BUG-176: Try to resolve missing citations before removing them.
# Parse cite_key → search query, look up via academic APIs,
# and add found entries to references.bib.
resolved_keys: set[str] = set()
new_bib_entries: list[str] = []
if len(invalid_keys) <= 30: # Sanity: don't flood APIs
resolved_keys, new_bib_entries = _resolve_missing_citations(
invalid_keys, bib_text
)
if resolved_keys:
valid_keys.update(resolved_keys)
bib_text += "\n" + "\n\n".join(new_bib_entries) + "\n"
logger.info(
"Stage 22: Resolved %d/%d missing citations via API lookup",
len(resolved_keys), len(invalid_keys),
)
still_invalid = invalid_keys - resolved_keys
if still_invalid:
# IMP-29: Remove remaining unresolvable citations from
# BOTH single-key and multi-key brackets.
import re as _re_imp29
for bad_key in still_invalid:
# Remove single-key brackets
final_paper = final_paper.replace(f"[{bad_key}]", "")
# Remove from multi-key brackets: [good, BAD, good] → [good, good]
def _remove_from_multi(m: _re.Match) -> str:
inner = m.group(1)
parts = [p.strip() for p in _re.split(r"[,;]\s*", inner)]
filtered = [p for p in parts if p != bad_key]
if not filtered:
return ""
return "[" + ", ".join(filtered) + "]"
final_paper = _re_imp29.sub(
r"\[([^\]]*\b" + _re.escape(bad_key) + r"\b[^\]]*)\]",
_remove_from_multi,
final_paper,
)
# Clean up whitespace artifacts from removed citations
final_paper = _re_imp29.sub(r" +", " ", final_paper)
final_paper = _re_imp29.sub(r" ([.,;:)])", r"\1", final_paper)
(stage_dir / "paper_final.md").write_text(final_paper, encoding="utf-8")
if still_invalid:
(stage_dir / "invalid_citations.json").write_text(
json.dumps(sorted(still_invalid), indent=2), encoding="utf-8"
)
artifacts.append("invalid_citations.json")
if resolved_keys:
(stage_dir / "resolved_citations.json").write_text(
json.dumps(sorted(resolved_keys), indent=2), encoding="utf-8"
)
artifacts.append("resolved_citations.json")
final_paper_latex = final_paper # default: no citation conversion
if valid_keys:
_CITE_KEY_PAT = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9]*"
# Step 1: Convert multi-key brackets [key1, key2] → \cite{key1, key2}
def _replace_multi_cite(m: _re.Match[str]) -> str:
keys = [k.strip() for k in m.group(1).split(",")]
matched = [k for k in keys if k in valid_keys]
if matched:
return "\\cite{" + ", ".join(matched) + "}"
return m.group(0)
final_paper_latex = _re.sub(
rf"\[({_CITE_KEY_PAT}(?:\s*,\s*{_CITE_KEY_PAT})+)\]",
_replace_multi_cite,
final_paper,
)
# Step 2: Convert single-key brackets [key] → \cite{key}
def _replace_cite(m: _re.Match[str]) -> str:
key = m.group(1)
if key in valid_keys:
return f"\\cite{{{key}}}"
return m.group(0)
final_paper_latex = _re.sub(
rf"\[({_CITE_KEY_PAT})\]", _replace_cite, final_paper_latex
)
# Step 3: Merge adjacent \cite{a} \cite{b} → \cite{a, b}
def _merge_adjacent_cites(m: _re.Match[str]) -> str:
keys = _re.findall(r"\\cite\{([^}]+)\}", m.group(0))
return "\\cite{" + ", ".join(keys) + "}"
final_paper_latex = _re.sub(
r"\\cite\{[^}]+\}(?:\s*\\cite\{[^}]+\})+",
_merge_adjacent_cites,
final_paper_latex,
)
(stage_dir / "paper_final_latex.md").write_text(
final_paper_latex, encoding="utf-8"
)
artifacts.append("paper_final_latex.md")
# IMP-1: Prune uncited bibliography entries — keep only keys
# that actually appear in the paper text (bracket or \cite form).
if valid_keys:
_all_cited: set[str] = set()
# Bracket-format citations [key]
_all_cited.update(
_re.findall(r"\[([a-zA-Z]+\d{4}[a-zA-Z0-9_-]*)\]", final_paper)
)
# \cite{key, key2} format (original + latex-converted)
for _src in (
final_paper,
final_paper_latex,
):
for _cm in _re.finditer(r"\\cite\{([^}]+)\}", _src):
_all_cited.update(
k.strip() for k in _cm.group(1).split(",")
)
uncited_keys = valid_keys - _all_cited
if uncited_keys:
bib_text = _remove_bibtex_entries(bib_text, uncited_keys)
logger.info(
"Stage 22: Pruned %d uncited bibliography entries "
"(kept %d)",
len(uncited_keys),
len(valid_keys) - len(uncited_keys),
)
# Write final references.bib
(stage_dir / "references.bib").write_text(bib_text, encoding="utf-8")
artifacts.append("references.bib")
logger.info(
"Stage 22: Exported references.bib with %d entries",
len(valid_keys) if valid_keys else 0,
)
# Conference template: generate .tex file
try:
from researchclaw.templates import get_template, markdown_to_latex
tpl = get_template(config.export.target_conference)
# Use the latex-citation-processed version if available
tex_source = final_paper_latex
# Append NeurIPS-style checklist if target is a ML conference
if tpl.name in ("neurips_2024", "neurips_2025", "icml_2025", "icml_2026",
"iclr_2025", "iclr_2026"):
_has_exp = bool(_read_prior_artifact(run_dir, "experiment_summary.json"))
_checklist = _generate_neurips_checklist(
has_experiments=_has_exp,
has_code=True,
)
if "NeurIPS Paper Checklist" not in tex_source:
tex_source = tex_source.rstrip() + "\n\n" + _checklist
_t = _extract_paper_title(tex_source)
tex_content = markdown_to_latex(
tex_source,
tpl,
title=_t if _t != "Untitled Paper" else "",
authors=config.export.authors,
bib_file=config.export.bib_file,
bib_entries=_ay_map or None,
)
(stage_dir / "paper.tex").write_text(tex_content, encoding="utf-8")
artifacts.append("paper.tex")
logger.info(
"Stage 22: Generated paper.tex for %s (%d chars)",
tpl.display_name,
len(tex_content),
)
# --- Phase 1 anti-fabrication: verify paper against VerifiedRegistry ---
_vresult = None # BUG-DA8-04: Initialize before try to avoid fragile dir() check
try:
from researchclaw.pipeline.paper_verifier import verify_paper as _verify_paper
# BUG-222: Use best_only=True to validate against promoted best data only
from researchclaw.pipeline.verified_registry import (
VerifiedRegistry as _VR22,
)
_vr22 = _VR22.from_run_dir(
run_dir,
metric_direction=config.experiment.metric_direction,
best_only=True,
)
if _vr22.values:
_vresult = _verify_paper(tex_content, _vr22)
(stage_dir / "paper_verification.json").write_text(
json.dumps({
"passed": _vresult.passed,
"severity": _vresult.severity,
"total_checked": _vresult.total_numbers_checked,
"total_verified": _vresult.total_numbers_verified,
"strict_violations": _vresult.strict_violations,
"lenient_violations": _vresult.lenient_violations,
"fabrication_rate": round(_vresult.fabrication_rate, 4),
"unverified_numbers": [
{"value": u.value, "line": u.line_number,
"section": u.section, "in_table": u.in_table}
for u in _vresult.unverified_numbers[:20]
],
"fabricated_conditions": [
{"name": fc.name, "line": fc.line_number}
for fc in _vresult.fabricated_conditions
],
"config_warnings": getattr(_vresult, "config_warnings", []),
"summary": _vresult.summary,
}, indent=2),
encoding="utf-8",
)
logger.info(
"Stage 22: Paper verification — %s (%d checked, %d verified, "
"%d strict violations, fabrication_rate=%.1f%%)",
_vresult.severity,
_vresult.total_numbers_checked,
_vresult.total_numbers_verified,
_vresult.strict_violations,
_vresult.fabrication_rate * 100,
)
except Exception as _pv_exc:
logger.debug("Stage 22: Paper verification skipped: %s", _pv_exc)
# BUG-23 P1: Enforce REJECT verdict — sanitize unverified numbers
if _vresult is not None and getattr(_vresult, "severity", None) == "REJECT":
logger.warning(
"Stage 22: Paper REJECTED by verifier (fabrication_rate=%.1f%%, "
"%d strict violations). Sanitizing unverified numbers.",
_vresult.fabrication_rate * 100,
_vresult.strict_violations,
)
# Replace unverified numbers in strict sections/tables with "---"
import re as _re_san2
# BUG-R49-02: Section names that sound like results but are
# actually protocol/setup sections should NOT trigger strict
# sanitization. Exempt sections containing "dataset", "setup",
# "protocol", "hyperparameter", or "implementation".
_STRICT_EXEMPT_KW = {"dataset", "setup", "protocol",
"hyperparameter", "implementation",
"hardware", "infrastructure"}
_sanitized_tex = tex_content
_san2_count = 0
for _uv in sorted(_vresult.unverified_numbers, key=lambda u: -u.line_number):
# Only sanitize strict-section / in-table numbers
_uv_section_lower = (_uv.section or "").lower()
_uv_is_strict = any(
s in _uv_section_lower
for s in ("results", "experiment", "evaluation",
"ablation", "comparison", "analysis")
)
# BUG-R49-02: Exempt protocol/setup sections from strict mode
if _uv_is_strict and any(
kw in _uv_section_lower for kw in _STRICT_EXEMPT_KW
):
_uv_is_strict = False
if _uv_is_strict or _uv.in_table:
_lines = _sanitized_tex.split("\n")
if 0 < _uv.line_number <= len(_lines):
_orig_line = _lines[_uv.line_number - 1]
# BUG-R49-01: Use word-boundary regex instead of
# naive substring matching to avoid replacing numbers
# inside identifiers (e.g. "18" in "ResNet18").
# BUG-206: Include ASCII hyphen and Unicode hyphens
# (U+2010 hyphen, U+2011 non-breaking hyphen,
# U+2013 en-dash) so that model variant numbers
# like "34" in "ResNet-34" or "ResNet‑34" are not
# mistaken for unverified experimental values.
# BUG-210: Include period (.) so that fractional
# parts of decimals in condition names like
# "ema_decay_0.9" are not treated as standalone
# numbers (prevents "0.9" → "0.---").
_BOUNDARY = "A-Za-z0-9_\u2010\u2011\u2013\\-."
for _rep in (
f"{_uv.value:.4f}".rstrip("0").rstrip("."),
f"{_uv.value:.3f}",
f"{_uv.value:.2f}",
f"{_uv.value:.1f}",
f"{_uv.value:g}",
str(_uv.value),
):
# Word boundary: number must NOT be adjacent to
# alphanumeric, underscore, or hyphen on either side.
_pat = (
rf"(? _page_limit:
logger.warning(
"BUG-27: Paper is %d pages (limit %d). "
"Consider tightening content in revision.",
_qc.page_count, _page_limit,
)
except Exception as _qc_exc: # noqa: BLE001
logger.debug("Stage 22: Quality checks skipped: %s", _qc_exc)
else:
logger.warning("Stage 22: LaTeX compilation verification FAILED: %s", _compile_result.errors[:3])
# Add compilation failure comment to .tex
_tex_path = stage_dir / "paper.tex"
if _tex_path.exists():
_tex_content = _tex_path.read_text(encoding="utf-8")
if "% WARNING: Compilation failed" not in _tex_content:
_tex_content = (
"% WARNING: Compilation failed. Errors:\n"
+ "".join(f"% {e}\n" for e in _compile_result.errors[:5])
+ _tex_content
)
_tex_path.write_text(_tex_content, encoding="utf-8")
except Exception as _compile_exc: # noqa: BLE001
logger.debug("Stage 22: Compile verification skipped: %s", _compile_exc)
except Exception as exc: # noqa: BLE001
logger.error("LaTeX generation failed: %s", exc, exc_info=True)
# (Charts, BUG-99 path fix, and remove_missing_figures are now handled
# BEFORE compile_latex() — see "Pre-compilation" block above.)
# --- Code packaging: multi-file directory or single file ---
exp_final_dir_path = _read_prior_artifact(run_dir, "experiment_final/")
if exp_final_dir_path and Path(exp_final_dir_path).is_dir():
import ast
code_dir = stage_dir / "code"
code_dir.mkdir(parents=True, exist_ok=True)
all_code_combined = ""
code_file_names: list[str] = []
for src in sorted(Path(exp_final_dir_path).glob("*.py")):
(code_dir / src.name).write_bytes(src.read_bytes())
all_code_combined += src.read_text(encoding="utf-8") + "\n"
code_file_names.append(src.name)
# Detect dependencies from all files
detected: set[str] = set()
known_packages = {
"numpy": "numpy",
"torch": "torch",
"tensorflow": "tensorflow",
"sklearn": "scikit-learn",
"scikit-learn": "scikit-learn",
"scipy": "scipy",
"pandas": "pandas",
"matplotlib": "matplotlib",
"seaborn": "seaborn",
"transformers": "transformers",
"datasets": "datasets",
"jax": "jax",
}
try:
tree = ast.parse(all_code_combined)
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
top = alias.name.split(".")[0]
if top in known_packages:
detected.add(known_packages[top])
elif isinstance(node, ast.ImportFrom) and node.module:
top = node.module.split(".")[0]
if top in known_packages:
detected.add(known_packages[top])
except SyntaxError:
pass
requirements = sorted(detected)
(code_dir / "requirements.txt").write_text(
"\n".join(requirements) + ("\n" if requirements else ""),
encoding="utf-8",
)
paper_title = _extract_paper_title(final_paper)
file_list_md = "\n".join(f"- `{f}`" for f in code_file_names)
readme = (
f"# Code Package for {paper_title}\n\n"
"## Description\n"
"This directory contains the experiment project used for the paper.\n\n"
"## Project Files\n"
f"{file_list_md}\n\n"
"## How to Run\n"
"`python main.py`\n\n"
"## Dependencies\n"
"Install dependencies with `pip install -r requirements.txt` if needed.\n"
)
(code_dir / "README.md").write_text(readme, encoding="utf-8")
artifacts.append("code/")
logger.info(
"Stage 22: Packaged multi-file code release (%d files, %d deps)",
len(code_file_names),
len(requirements),
)
else:
# Backward compat: single-file packaging
code_payload = _read_prior_artifact(run_dir, "experiment_final.py")
if not code_payload:
code_payload = _read_prior_artifact(run_dir, "experiment.py")
if code_payload:
import ast
code_dir = stage_dir / "code"
code_dir.mkdir(parents=True, exist_ok=True)
(code_dir / "experiment.py").write_text(code_payload, encoding="utf-8")
detected_single: set[str] = set()
known_packages_single = {
"numpy": "numpy",
"torch": "torch",
"tensorflow": "tensorflow",
"sklearn": "scikit-learn",
"scikit-learn": "scikit-learn",
"scipy": "scipy",
"pandas": "pandas",
"matplotlib": "matplotlib",
"seaborn": "seaborn",
"transformers": "transformers",
"datasets": "datasets",
"jax": "jax",
}
try:
tree = ast.parse(code_payload)
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
top = alias.name.split(".")[0]
if top in known_packages_single:
detected_single.add(known_packages_single[top])
elif isinstance(node, ast.ImportFrom) and node.module:
top = node.module.split(".")[0]
if top in known_packages_single:
detected_single.add(known_packages_single[top])
except SyntaxError:
pass
requirements = sorted(detected_single)
(code_dir / "requirements.txt").write_text(
"\n".join(requirements) + ("\n" if requirements else ""),
encoding="utf-8",
)
paper_title = _extract_paper_title(final_paper)
readme = (
f"# Code Package for {paper_title}\n\n"
"## Description\n"
"This directory contains the final experiment script used for the paper.\n\n"
"## How to Run\n"
"`python experiment.py`\n\n"
"## Dependencies\n"
"Install dependencies with `pip install -r requirements.txt` if needed.\n"
)
(code_dir / "README.md").write_text(readme, encoding="utf-8")
artifacts.append("code/")
logger.info(
"Stage 22: Packaged single-file code release with %d deps",
len(requirements),
)
# WS-5.5: Generate framework diagram prompt for methodology section
try:
_framework_prompt = _generate_framework_diagram_prompt(
final_paper, config, llm=llm
)
if _framework_prompt:
_chart_dir = stage_dir / "charts"
_chart_dir.mkdir(parents=True, exist_ok=True)
(_chart_dir / "framework_diagram_prompt.md").write_text(
_framework_prompt, encoding="utf-8"
)
logger.info("Stage 22: Generated framework diagram prompt → charts/framework_diagram_prompt.md")
except Exception as exc: # noqa: BLE001
logger.debug("Stage 22: Framework diagram prompt generation skipped: %s", exc)
return StageResult(
stage=Stage.EXPORT_PUBLISH,
status=StageStatus.DONE,
artifacts=tuple(artifacts),
evidence_refs=tuple(f"stage-22/{a}" for a in artifacts),
)
# ---------------------------------------------------------------------------
# Citation helpers
# ---------------------------------------------------------------------------
def _check_citation_relevance(
llm: Any,
topic: str,
results: list[Any],
) -> dict[str, float | None]:
"""Use LLM to assess relevance of each citation to the research topic.
Returns a dict mapping cite_key → relevance score (0.0–1.0).
Processes citations in batches of 30 to handle large bibliographies.
"""
citation_lines = []
for cr in results:
citation_lines.append(f"- [{cr.cite_key}] \"{cr.title}\"")
if not citation_lines:
return {}
all_scores: dict[str, float] = {}
_BATCH_SIZE = 30
for batch_start in range(0, len(citation_lines), _BATCH_SIZE):
batch = citation_lines[batch_start:batch_start + _BATCH_SIZE]
citations_text = "\n".join(batch)
prompt = (
f"Research topic: {topic}\n\n"
f"Rate the relevance of each citation to the research topic "
f"on a scale of 0.0 to 1.0.\n"
f"Return ONLY a JSON object mapping cite_key to relevance score.\n"
f"Example: {{\"smith2020\": 0.9, \"jones2019\": 0.2}}\n\n"
f"Citations:\n{citations_text}"
)
try:
resp = llm.chat(
[{"role": "user", "content": prompt}],
system="You assess citation relevance. Return only valid JSON.",
json_mode=True,
)
parsed = _safe_json_loads(resp.content, {})
if isinstance(parsed, dict):
for k, v in parsed.items():
if isinstance(v, (int, float)):
all_scores[k] = max(0.0, min(1.0, float(v)))
except Exception: # noqa: BLE001
logger.debug(
"Citation relevance check failed for batch %d–%d, skipping",
batch_start, batch_start + len(batch),
)
return all_scores
def _remove_bibtex_entries(bib_text: str, keys_to_remove: set[str]) -> str:
"""Remove BibTeX entries whose keys are in *keys_to_remove*."""
kept: list[str] = []
for m in re.finditer(r"@\w+\{([^,]+),", bib_text):
key = m.group(1).strip()
if key in keys_to_remove:
continue
# Find the full entry (from @ to the next @ or end)
start = m.start()
# Find balanced braces
depth = 0
end = start
for i in range(start, len(bib_text)):
if bib_text[i] == "{":
depth += 1
elif bib_text[i] == "}":
depth -= 1
if depth == 0:
end = i + 1
break
if end > start:
kept.append(bib_text[start:end])
return "\n\n".join(kept) + "\n" if kept else ""
def _remove_citations_from_text(text: str, keys_to_remove: set[str]) -> str:
"""Remove \\cite{key} and [key] references for specified citation keys."""
# Handle multi-key LaTeX cites: \cite{a,b,c} → filter keys inside braces
def _filter_cite(m: re.Match[str]) -> str:
keys = [k.strip() for k in m.group(1).split(",")]
kept = [k for k in keys if k not in keys_to_remove]
if not kept:
return ""
return f"\\cite{{{','.join(kept)}}}"
text = re.sub(r"\\cite\{([^}]+)\}", _filter_cite, text)
# Markdown: [key]
for key in keys_to_remove:
text = re.sub(rf"\[{re.escape(key)}\]", "", text)
return text
# ---------------------------------------------------------------------------
# Stage 23: Citation Verify
# ---------------------------------------------------------------------------
def _execute_citation_verify(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 23: verify every BibTeX entry against external sources, drop
    low-relevance / hallucinated citations, and emit cleaned artifacts.

    Outputs (written into *stage_dir*):
      - ``verification_report.json`` — per-citation verification results.
      - ``references_verified.bib`` — filtered/pruned bibliography.
      - ``paper_final_verified.md`` — annotated paper (only when a prior
        ``paper_final.md`` exists).

    *llm* enables the optional topical-relevance scoring; *prompts* and
    *adapters* are unused here but kept for the shared stage signature.
    """
    from researchclaw.literature.verify import (
        VerifyStatus,
        annotate_paper_hallucinations,
        filter_verified_bibtex,
        verify_citations,
    )
    bib_text = _read_prior_artifact(run_dir, "references.bib") or ""
    paper_text = _read_prior_artifact(run_dir, "paper_final.md") or ""
    # Fast path: nothing to verify — emit an empty-but-valid report so
    # downstream artifact checks still pass.
    if not bib_text.strip():
        report_data = {
            "summary": {
                "total": 0,
                "verified": 0,
                "suspicious": 0,
                "hallucinated": 0,
                "skipped": 0,
                "integrity_score": 1.0,
            },
            "results": [],
            "note": "No references.bib found — nothing to verify.",
        }
        (stage_dir / "verification_report.json").write_text(
            json.dumps(report_data, indent=2), encoding="utf-8"
        )
        (stage_dir / "references_verified.bib").write_text(
            "% No references to verify\n", encoding="utf-8"
        )
        return StageResult(
            stage=Stage.CITATION_VERIFY,
            status=StageStatus.DONE,
            artifacts=("verification_report.json", "references_verified.bib"),
            evidence_refs=(
                "stage-23/verification_report.json",
                "stage-23/references_verified.bib",
            ),
        )
    s2_api_key = getattr(config.llm, "s2_api_key", "") or ""
    from researchclaw.literature.verify import parse_bibtex_entries
    _n_entries = len(parse_bibtex_entries(bib_text))
    logger.info(
        "[citation-verify] Verifying %d references "
        "(DOI→CrossRef > OpenAlex > arXiv > S2)…",
        _n_entries,
    )
    report = verify_citations(bib_text, s2_api_key=s2_api_key)
    logger.info(
        "[citation-verify] Done: %d verified, %d suspicious, "
        "%d hallucinated, %d skipped (integrity: %.0f%%)",
        report.verified,
        report.suspicious,
        report.hallucinated,
        report.skipped,
        report.integrity_score * 100,
    )
    # --- Relevance check: assess topical relevance of verified citations ---
    if llm is not None and report.results:
        relevance_scores = _check_citation_relevance(
            llm, config.research.topic, report.results
        )
        for cr in report.results:
            score = relevance_scores.get(cr.cite_key)
            if score is not None:
                cr.relevance_score = score
    # FIX-5: Filter low-relevance citations and enforce hard cap
    RELEVANCE_THRESHOLD = 0.5
    MAX_CITATIONS = 60
    low_relevance_keys: set[str] = set()
    for cr in report.results:
        if cr.relevance_score is not None and cr.relevance_score < RELEVANCE_THRESHOLD:
            low_relevance_keys.add(cr.cite_key)
    # Hard cap: if still above MAX_CITATIONS after relevance filter, drop lowest
    # BUG-07 fix: Unscored citations (relevance_score=None) default to 0.7
    # because they passed API verification and are likely relevant.
    # Previously they defaulted to 0.0 which caused mass-deletion.
    _DEFAULT_RELEVANCE = 0.7
    remaining = [
        cr for cr in report.results
        if cr.cite_key not in low_relevance_keys
        and cr.status != VerifyStatus.HALLUCINATED
    ]
    if len(remaining) > MAX_CITATIONS:
        # Ascending sort → the head of the list is the lowest-relevance overflow.
        remaining.sort(
            key=lambda c: c.relevance_score if c.relevance_score is not None else _DEFAULT_RELEVANCE,
        )
        overflow = remaining[:len(remaining) - MAX_CITATIONS]
        for cr in overflow:
            low_relevance_keys.add(cr.cite_key)
        logger.info(
            "Stage 23: Hard cap applied, dropping %d additional low-relevance citations",
            len(overflow),
        )
    if low_relevance_keys:
        logger.info(
            "Stage 23: Filtering %d low-relevance citations (threshold=%.1f, cap=%d): %s",
            len(low_relevance_keys),
            RELEVANCE_THRESHOLD,
            MAX_CITATIONS,
            ", ".join(sorted(list(low_relevance_keys)[:20])),
        )
    (stage_dir / "verification_report.json").write_text(
        json.dumps(report.to_dict(), indent=2), encoding="utf-8"
    )
    verified_bib = filter_verified_bibtex(bib_text, report, include_suspicious=True)
    # Remove low-relevance entries from BibTeX
    if low_relevance_keys:
        verified_bib = _remove_bibtex_entries(verified_bib, low_relevance_keys)
    # BUG-26: If verification stripped >50% of entries (e.g. due to rate limiting),
    # fall back to the original bib to avoid breaking the paper's references
    original_count = len(re.findall(r"@\w+\{", bib_text))
    verified_count = len(re.findall(r"@\w+\{", verified_bib))
    if original_count > 0 and verified_count < original_count * 0.5:
        logger.warning(
            "Stage 23: Verification stripped %d→%d entries (>50%% loss). "
            "Keeping original bib to avoid breaking references.",
            original_count, verified_count,
        )
        verified_bib = bib_text
    # IMP-1: Also prune uncited entries from verified bib
    # BUG-182: Also scan LaTeX paper.tex (not just Markdown) for \cite{} keys.
    # The Markdown version may use [key] notation while LaTeX uses \cite{key}.
    if paper_text.strip():
        _vbib_keys = set(re.findall(r"@\w+\{([^,]+),", verified_bib))
        _cited_in_paper: set[str] = set()
        _cited_in_paper.update(
            re.findall(r"\[([a-zA-Z]+\d{4}[a-zA-Z0-9_-]*)\]", paper_text)
        )
        for _cm in re.finditer(r"\\cite\{([^}]+)\}", paper_text):
            _cited_in_paper.update(
                k.strip() for k in _cm.group(1).split(",")
            )
        # BUG-182: Also read stage-22/paper.tex for \cite{} keys
        # NOTE(review): assumes the export stage dir is always literally
        # "stage-22" (no versioned suffix) — confirm against run layout.
        _latex_paper = stage_dir.parent / "stage-22" / "paper.tex"
        if _latex_paper.exists():
            try:
                _latex_text = _latex_paper.read_text(encoding="utf-8")
                for _cm in re.finditer(r"\\cite[pt]?\{([^}]+)\}", _latex_text):
                    _cited_in_paper.update(
                        k.strip() for k in _cm.group(1).split(",")
                    )
            except OSError:
                # Best-effort: missing/unreadable LaTeX just skips this scan.
                pass
        _uncited_vbib = _vbib_keys - _cited_in_paper
        if _uncited_vbib:
            verified_bib = _remove_bibtex_entries(verified_bib, _uncited_vbib)
            logger.info(
                "Stage 23: Pruned %d uncited entries from verified bib "
                "(kept %d)",
                len(_uncited_vbib),
                len(_vbib_keys) - len(_uncited_vbib),
            )
    # BUG-100: If all entries were filtered out (low-relevance + uncited pruning),
    # write a comment instead of an empty file to avoid "Missing or empty output" error.
    if not verified_bib.strip():
        verified_bib = "% All citations were filtered out during verification\n"
        logger.warning(
            "Stage 23: All BibTeX entries filtered out — writing placeholder"
        )
    (stage_dir / "references_verified.bib").write_text(verified_bib, encoding="utf-8")
    artifacts = ["verification_report.json", "references_verified.bib"]
    if paper_text.strip():
        annotated = annotate_paper_hallucinations(paper_text, report)
        # Remove \cite{} and [cite_key] references for low-relevance entries
        if low_relevance_keys:
            annotated = _remove_citations_from_text(annotated, low_relevance_keys)
        (stage_dir / "paper_final_verified.md").write_text(annotated, encoding="utf-8")
        artifacts.append("paper_final_verified.md")
    logger.info(
        "Stage 23 citation verify: %d total, %d verified, %d suspicious, "
        "%d hallucinated, %d skipped (integrity=%.1f%%)",
        report.total,
        report.verified,
        report.suspicious,
        report.hallucinated,
        report.skipped,
        report.integrity_score * 100,
    )
    return StageResult(
        stage=Stage.CITATION_VERIFY,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-23/{a}" for a in artifacts),
    )
================================================
FILE: researchclaw/pipeline/stage_impls/_synthesis.py
================================================
"""Stages 7-8: Synthesis and hypothesis generation."""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._helpers import (
StageResult,
_default_hypotheses,
_get_evolution_overlay,
_multi_perspective_generate,
_parse_jsonl_rows,
_read_prior_artifact,
_synthesize_perspectives,
_utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
def _execute_synthesis(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 7: synthesize knowledge cards into ``synthesis.md``.

    With an LLM, the cards are fed into the "synthesis" prompt; without
    one, a deterministic placeholder document is written instead.
    """
    # Collect up to 24 knowledge-card markdown files as prompt context.
    # NOTE(review): _read_prior_artifact("cards/") appears to yield a
    # directory path here rather than file content — confirm in helper.
    card_dir = _read_prior_artifact(run_dir, "cards/") or ""
    cards_context = ""
    if card_dir:
        card_files = sorted(Path(card_dir).glob("*.md"))[:24]
        cards_context = "\n\n".join(
            f.read_text(encoding="utf-8") for f in card_files
        )
    if llm is None:
        # Offline fallback: fixed template stamped with the generation time.
        synthesis_md = f"""# Synthesis
## Cluster Overview
- Cluster A: Representation methods
- Cluster B: Training strategies
- Cluster C: Evaluation robustness
## Gap 1
Limited consistency across benchmark protocols.
## Gap 2
Under-reported failure behavior under distribution shift.
## Prioritized Opportunities
1. Unified experimental protocol
2. Robustness-aware evaluation suite
## Generated
{_utcnow_iso()}
"""
    else:
        manager = prompts or PromptManager()
        overlay = _get_evolution_overlay(run_dir, "synthesis")
        stage_prompt = manager.for_stage(
            "synthesis",
            evolution_overlay=overlay,
            topic=config.research.topic,
            cards_context=cards_context,
        )
        reply = llm.chat(
            [{"role": "user", "content": stage_prompt.user}],
            system=stage_prompt.system,
            max_tokens=stage_prompt.max_tokens or 8192,
        )
        synthesis_md = reply.content
    (stage_dir / "synthesis.md").write_text(synthesis_md, encoding="utf-8")
    return StageResult(
        stage=Stage.SYNTHESIS,
        status=StageStatus.DONE,
        artifacts=("synthesis.md",),
        evidence_refs=("stage-07/synthesis.md",),
    )
def _execute_hypothesis_gen(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 8: generate hypotheses via a multi-perspective debate, then
    run a best-effort novelty check against already-seen papers.
    """
    synthesis = _read_prior_artifact(run_dir, "synthesis.md") or ""
    if llm is None:
        hypotheses_md = _default_hypotheses(config.research.topic)
    else:
        manager = prompts or PromptManager()
        from researchclaw.prompts import DEBATE_ROLES_HYPOTHESIS  # noqa: PLC0415
        # --- Multi-perspective debate ---
        debate_dir = stage_dir / "perspectives"
        debate_vars = {"topic": config.research.topic, "synthesis": synthesis}
        perspectives = _multi_perspective_generate(
            llm, DEBATE_ROLES_HYPOTHESIS, debate_vars, debate_dir
        )
        if perspectives:
            # --- Synthesize debate outputs into final hypotheses ---
            hypotheses_md = _synthesize_perspectives(
                llm, perspectives, "hypothesis_synthesize", manager
            )
        else:
            # BUG-S2: an empty debate context would make the synthesis call
            # pure hallucination — fall back to defaults instead.
            logger.warning("All debate perspectives failed; using default hypotheses")
            hypotheses_md = _default_hypotheses(config.research.topic)
    (stage_dir / "hypotheses.md").write_text(hypotheses_md, encoding="utf-8")
    # --- Novelty check (non-blocking) ---
    novelty_artifacts: tuple[str, ...] = ()
    try:
        from researchclaw.literature.novelty import check_novelty  # noqa: PLC0415
        candidates_text = _read_prior_artifact(run_dir, "candidates.jsonl") or ""
        papers_seen = _parse_jsonl_rows(candidates_text) if candidates_text else []
        novelty_report = check_novelty(
            topic=config.research.topic,
            hypotheses_text=hypotheses_md,
            papers_already_seen=papers_seen,
            s2_api_key=getattr(config.llm, "s2_api_key", ""),
        )
        (stage_dir / "novelty_report.json").write_text(
            json.dumps(novelty_report, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        novelty_artifacts = ("novelty_report.json",)
        logger.info(
            "Novelty check: score=%.3f assessment=%s recommendation=%s",
            novelty_report["novelty_score"],
            novelty_report["assessment"],
            novelty_report["recommendation"],
        )
    except Exception:  # noqa: BLE001
        # Novelty scoring requires network access; never fail the stage on it.
        logger.warning("Novelty check failed (non-blocking)", exc_info=True)
    return StageResult(
        stage=Stage.HYPOTHESIS_GEN,
        status=StageStatus.DONE,
        artifacts=("hypotheses.md",) + novelty_artifacts,
        evidence_refs=("stage-08/hypotheses.md",),
    )
================================================
FILE: researchclaw/pipeline/stage_impls/_topic.py
================================================
"""Stages 1-2: Topic initialization and problem decomposition."""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import TYPE_CHECKING
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.hardware import detect_hardware, ensure_torch_available
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
StageResult,
_get_evolution_overlay,
_read_prior_artifact,
_safe_json_loads,
_utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager
logger = logging.getLogger(__name__)
def _execute_topic_init(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 1: write the research goal (``goal.md``), detect hardware,
    and optionally pre-install PyTorch for sandbox experiments.
    """
    topic = config.research.topic
    domains = (
        ", ".join(config.research.domains) if config.research.domains else "general"
    )
    if llm is None:
        # Offline fallback: deterministic SMART-goal template.
        goal_md = f"""# Research Goal
## Topic
{topic}
## Scope
Investigate the topic with emphasis on reproducible methods and measurable outcomes.
## SMART Goal
- Specific: Build a focused research plan for {topic}
- Measurable: Produce literature shortlist, hypotheses, experiment plan, and final paper
- Achievable: Complete through staged pipeline with gate checks
- Relevant: Aligned with project {config.project.name}
- Time-bound: Constrained by pipeline execution budget
## Constraints
- Quality threshold: {config.research.quality_threshold}
- Daily paper target: {config.research.daily_paper_count}
## Success Criteria
- At least 2 falsifiable hypotheses
- Executable experiment code and results analysis
- Revised paper passing quality gate
## Generated
{_utcnow_iso()}
"""
    else:
        manager = prompts or PromptManager()
        overlay = _get_evolution_overlay(run_dir, "topic_init")
        stage_prompt = manager.for_stage(
            "topic_init",
            evolution_overlay=overlay,
            topic=topic,
            domains=domains,
            project_name=config.project.name,
            quality_threshold=config.research.quality_threshold,
        )
        reply = llm.chat(
            [{"role": "user", "content": stage_prompt.user}],
            system=stage_prompt.system,
        )
        goal_md = reply.content
    (stage_dir / "goal.md").write_text(goal_md, encoding="utf-8")
    # --- Hardware detection (GPU / MPS / CPU) ---
    profile = detect_hardware()
    (stage_dir / "hardware_profile.json").write_text(
        json.dumps(profile.to_dict(), indent=2), encoding="utf-8"
    )
    if profile.warning:
        logger.warning("Hardware advisory: %s", profile.warning)
    else:
        logger.info("Hardware detected: %s (%s, %s MB VRAM)", profile.gpu_name, profile.gpu_type, profile.vram_mb)
    # --- Optionally ensure PyTorch is available ---
    if profile.has_gpu:
        if config.experiment.mode == "sandbox":
            if ensure_torch_available(config.experiment.sandbox.python_path, profile.gpu_type):
                logger.info("PyTorch is available for sandbox experiments")
            else:
                logger.warning("PyTorch could not be installed; sandbox will use CPU-only packages")
        elif config.experiment.mode == "docker":
            logger.info("Docker sandbox: PyTorch pre-installed in container image")
    return StageResult(
        stage=Stage.TOPIC_INIT,
        status=StageStatus.DONE,
        artifacts=("goal.md", "hardware_profile.json"),
        evidence_refs=("stage-01/goal.md", "stage-01/hardware_profile.json"),
    )
def _execute_problem_decompose(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 2: decompose the research goal into ``problem_tree.md`` and,
    when an LLM is available, run a non-blocking topic-quality check
    (``topic_evaluation.json``).
    """
    goal_text = _read_prior_artifact(run_dir, "goal.md") or ""
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "problem_decompose")
        sp = _pm.for_stage(
            "problem_decompose",
            evolution_overlay=_overlay,
            topic=config.research.topic,
            goal_text=goal_text,
        )
        resp = llm.chat(
            [{"role": "user", "content": sp.user}],
            system=sp.system,
        )
        body = resp.content
    else:
        # Offline fallback: deterministic decomposition template.
        body = f"""# Problem Decomposition
## Source
Derived from `goal.md` for topic: {config.research.topic}
## Sub-questions
1. Which problem settings and benchmarks define current SOTA?
2. Which methodological gaps remain unresolved?
3. Which hypotheses are testable under realistic constraints?
4. Which datasets and metrics best discriminate method quality?
5. Which failure modes can invalidate expected gains?
## Priority Ranking
1. Problem framing and benchmark setup
2. Gap identification and hypothesis formulation
3. Experiment and metric design
4. Failure analysis and robustness checks
## Risks
- Ambiguous task definition
- Dataset leakage or metric mismatch
## Generated
{_utcnow_iso()}
"""
    (stage_dir / "problem_tree.md").write_text(body, encoding="utf-8")
    # IMP-35: Topic/title quality pre-evaluation
    # Quick LLM check: is the topic well-scoped for a conference paper?
    if llm is not None:
        try:
            _eval_resp = llm.chat(
                [
                    {
                        "role": "user",
                        "content": (
                            "Evaluate this research topic for a top ML conference paper. "
                            "Score 1-10 on: (a) novelty, (b) specificity, (c) feasibility. "
                            "If overall score < 5, suggest a refined topic.\n\n"
                            f"Topic: {config.research.topic}\n\n"
                            "Reply as JSON: {\"novelty\": N, \"specificity\": N, "
                            "\"feasibility\": N, \"overall\": N, \"suggestion\": \"...\"}"
                        ),
                    }
                ],
                system=(
                    f"You are a senior {_detect_domain(config.research.topic, config.research.domains)[1]} "
                    f"researcher evaluating research topic quality."
                ),
            )
            # _safe_json_loads returns the fallback ({}) on malformed output.
            _eval_data = _safe_json_loads(_eval_resp.content, {})
            if isinstance(_eval_data, dict):
                # Missing "overall" defaults to 10 — i.e. treated as passing.
                overall = _eval_data.get("overall", 10)
                if isinstance(overall, (int, float)) and overall < 5:
                    logger.warning(
                        "IMP-35: Topic quality score %s/10 — consider refining: %s",
                        overall,
                        _eval_data.get("suggestion", ""),
                    )
                else:
                    logger.info("IMP-35: Topic quality score %s/10", overall)
                (stage_dir / "topic_evaluation.json").write_text(
                    json.dumps(_eval_data, indent=2), encoding="utf-8"
                )
        except Exception:  # noqa: BLE001
            # The quality check is advisory only — never fail the stage on it.
            logger.debug("IMP-35: Topic evaluation skipped (non-blocking)")
    return StageResult(
        stage=Stage.PROBLEM_DECOMPOSE,
        status=StageStatus.DONE,
        artifacts=("problem_tree.md",),
        evidence_refs=("stage-02/problem_tree.md",),
    )
================================================
FILE: researchclaw/pipeline/stages.py
================================================
"""23-stage ResearchClaw pipeline state machine.
Defines the stage sequence, status transitions, gate logic, and rollback rules.
Migrated from arc/state_machine.py (19 stages) with the following changes:
- SEARCH_PLAN + SOURCE_CONNECT → SEARCH_STRATEGY
- RELEVANCE_SCREEN + QUALITY_SCREEN → LITERATURE_SCREEN
- CLUSTER_TOPICS + GAP_ANALYSIS → SYNTHESIS
- EXPERIMENT_DESIGN split → EXPERIMENT_DESIGN + CODE_GENERATION
- EXECUTE split → EXPERIMENT_RUN + ITERATIVE_REFINE
- WRITE_DRAFT split → PAPER_OUTLINE + PAPER_DRAFT
- Added PAPER_REVISION, QUALITY_GATE, EXPORT_PUBLISH
- RETROSPECTIVE_ARCHIVE split → KNOWLEDGE_ARCHIVE (+ QUALITY_GATE + EXPORT_PUBLISH)
"""
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum, IntEnum
from typing import Iterable
class Stage(IntEnum):
    """23-stage research pipeline.

    Values are ordinal: execution advances in increasing numeric order
    (see ``STAGE_SEQUENCE`` / ``NEXT_STAGE``). Stages marked ``GATE``
    also appear in ``GATE_STAGES`` and may require human approval.
    """
    # Phase A: Research Scoping
    TOPIC_INIT = 1
    PROBLEM_DECOMPOSE = 2
    # Phase B: Literature Discovery
    SEARCH_STRATEGY = 3
    LITERATURE_COLLECT = 4
    LITERATURE_SCREEN = 5  # GATE
    KNOWLEDGE_EXTRACT = 6
    # Phase C: Knowledge Synthesis
    SYNTHESIS = 7
    HYPOTHESIS_GEN = 8
    # Phase D: Experiment Design
    EXPERIMENT_DESIGN = 9  # GATE
    CODE_GENERATION = 10  # NEW
    RESOURCE_PLANNING = 11
    # Phase E: Experiment Execution
    EXPERIMENT_RUN = 12
    ITERATIVE_REFINE = 13  # NEW
    # Phase F: Analysis & Decision
    RESULT_ANALYSIS = 14
    RESEARCH_DECISION = 15
    # Phase G: Paper Writing
    PAPER_OUTLINE = 16
    PAPER_DRAFT = 17
    PEER_REVIEW = 18
    PAPER_REVISION = 19  # NEW
    # Phase H: Finalization
    QUALITY_GATE = 20  # GATE
    KNOWLEDGE_ARCHIVE = 21
    EXPORT_PUBLISH = 22
    CITATION_VERIFY = 23
class StageStatus(str, Enum):
    """Lifecycle status of a single stage; legal moves are listed in
    ``TRANSITION_MAP``. ``DONE`` is terminal (no outgoing transitions).
    """
    PENDING = "pending"  # created, not yet started
    RUNNING = "running"  # actively executing
    BLOCKED_APPROVAL = "blocked_approval"  # gate stage awaiting human decision
    APPROVED = "approved"  # human approved
    REJECTED = "rejected"  # human rejected (triggers rollback)
    PAUSED = "paused"  # suspended (approval timeout or post-failure pause)
    RETRYING = "retrying"  # re-running after a failure
    FAILED = "failed"  # last run errored
    DONE = "done"  # terminal: completed successfully
class TransitionEvent(str, Enum):
    """External events fed into ``advance()`` to drive status transitions."""
    START = "start"
    SUCCEED = "succeed"
    APPROVE = "approve"
    REJECT = "reject"
    TIMEOUT = "timeout"
    FAIL = "fail"
    RETRY = "retry"
    RESUME = "resume"
    PAUSE = "pause"
# ---------------------------------------------------------------------------
# Stage navigation
# ---------------------------------------------------------------------------
# Ordered stage list plus precomputed successor/predecessor lookup tables.
STAGE_SEQUENCE: tuple[Stage, ...] = tuple(Stage)
NEXT_STAGE: dict[Stage, Stage | None] = {
    stage: STAGE_SEQUENCE[idx + 1] if idx + 1 < len(STAGE_SEQUENCE) else None
    for idx, stage in enumerate(STAGE_SEQUENCE)
}
PREVIOUS_STAGE: dict[Stage, Stage | None] = {
    stage: STAGE_SEQUENCE[idx - 1] if idx > 0 else None
    for idx, stage in enumerate(STAGE_SEQUENCE)
}
# ---------------------------------------------------------------------------
# Gate stages — require approval before proceeding
# ---------------------------------------------------------------------------
GATE_STAGES: frozenset[Stage] = frozenset(
    {
        Stage.LITERATURE_SCREEN,
        Stage.EXPERIMENT_DESIGN,
        Stage.QUALITY_GATE,
    }
)
# Gate rollback targets: when a gate rejects, where to roll back
GATE_ROLLBACK: dict[Stage, Stage] = {
    Stage.LITERATURE_SCREEN: Stage.LITERATURE_COLLECT,  # reject → re-collect
    Stage.EXPERIMENT_DESIGN: Stage.HYPOTHESIS_GEN,  # reject → re-hypothesize
    Stage.QUALITY_GATE: Stage.PAPER_OUTLINE,  # reject → rewrite paper
}
# ---------------------------------------------------------------------------
# Research decision rollback targets (PIVOT/REFINE from Stage 15)
# ---------------------------------------------------------------------------
DECISION_ROLLBACK: dict[str, Stage] = {
    "pivot": Stage.HYPOTHESIS_GEN,  # Discard hypotheses, re-generate
    "refine": Stage.ITERATIVE_REFINE,  # Keep hypotheses, re-run experiments
}
MAX_DECISION_PIVOTS: int = 2  # Prevent infinite loops
# ---------------------------------------------------------------------------
# Noncritical stages — can be skipped on failure without aborting pipeline
# ---------------------------------------------------------------------------
NONCRITICAL_STAGES: frozenset[Stage] = frozenset(
    {
        Stage.QUALITY_GATE,  # 20: low quality should warn, not block deliverables
        Stage.KNOWLEDGE_ARCHIVE,  # 21: archival doesn't affect paper output
        # T3.4: CITATION_VERIFY removed — hallucinated citations MUST block export
    }
)
# ---------------------------------------------------------------------------
# Phase groupings (for UI and reporting)
# ---------------------------------------------------------------------------
PHASE_MAP: dict[str, tuple[Stage, ...]] = {
    "A: Research Scoping": (Stage.TOPIC_INIT, Stage.PROBLEM_DECOMPOSE),
    "B: Literature Discovery": (
        Stage.SEARCH_STRATEGY,
        Stage.LITERATURE_COLLECT,
        Stage.LITERATURE_SCREEN,
        Stage.KNOWLEDGE_EXTRACT,
    ),
    "C: Knowledge Synthesis": (Stage.SYNTHESIS, Stage.HYPOTHESIS_GEN),
    "D: Experiment Design": (
        Stage.EXPERIMENT_DESIGN,
        Stage.CODE_GENERATION,
        Stage.RESOURCE_PLANNING,
    ),
    "E: Experiment Execution": (Stage.EXPERIMENT_RUN, Stage.ITERATIVE_REFINE),
    "F: Analysis & Decision": (Stage.RESULT_ANALYSIS, Stage.RESEARCH_DECISION),
    "G: Paper Writing": (
        Stage.PAPER_OUTLINE,
        Stage.PAPER_DRAFT,
        Stage.PEER_REVIEW,
        Stage.PAPER_REVISION,
    ),
    "H: Finalization": (
        Stage.QUALITY_GATE,
        Stage.KNOWLEDGE_ARCHIVE,
        Stage.EXPORT_PUBLISH,
        Stage.CITATION_VERIFY,
    ),
}
# ---------------------------------------------------------------------------
# Transition logic
# ---------------------------------------------------------------------------
# Allowed status moves: current status → set of statuses reachable from it.
# DONE maps to the empty set, i.e. it is terminal.
TRANSITION_MAP: dict[StageStatus, frozenset[StageStatus]] = {
    StageStatus.PENDING: frozenset({StageStatus.RUNNING}),
    StageStatus.RUNNING: frozenset(
        {StageStatus.DONE, StageStatus.BLOCKED_APPROVAL, StageStatus.FAILED}
    ),
    StageStatus.BLOCKED_APPROVAL: frozenset(
        {StageStatus.APPROVED, StageStatus.REJECTED, StageStatus.PAUSED}
    ),
    StageStatus.APPROVED: frozenset({StageStatus.DONE}),
    StageStatus.REJECTED: frozenset({StageStatus.PENDING}),
    StageStatus.PAUSED: frozenset({StageStatus.RUNNING}),
    StageStatus.RETRYING: frozenset({StageStatus.RUNNING}),
    StageStatus.FAILED: frozenset({StageStatus.RETRYING, StageStatus.PAUSED}),
    StageStatus.DONE: frozenset(),
}
@dataclass(frozen=True)
class TransitionOutcome:
    """Immutable result of ``advance()``: the new state plus side-channel hints."""

    stage: Stage  # stage the machine is now at (a rejection may move it back)
    status: StageStatus  # new status for that stage
    next_stage: Stage | None  # stage to execute next; None at end of pipeline
    rollback_stage: Stage | None = None  # set when a gate rejection rolled back
    checkpoint_required: bool = False  # caller should persist a checkpoint now
    decision: str = "proceed"  # one of "proceed" / "block" / "retry" / "pivot"
def gate_required(
    stage: Stage,
    hitl_required_stages: Iterable[int] | None = None,
) -> bool:
    """Return True when *stage* needs human-in-the-loop approval.

    Non-gate stages never require approval. For gate stages, an explicit
    *hitl_required_stages* collection of int stage numbers overrides the
    default policy that every gate stage requires approval.
    """
    if stage not in GATE_STAGES:
        return False
    if hitl_required_stages is None:
        # Default: all gate stages require approval.
        return True
    return int(stage) in frozenset(hitl_required_stages)
def default_rollback_stage(stage: Stage) -> Stage:
    """Rollback target for *stage*: the configured gate target if any,
    otherwise the previous stage, otherwise the stage itself (the first
    stage has no predecessor).
    """
    configured = GATE_ROLLBACK.get(stage)
    if configured is not None:
        return configured
    previous = PREVIOUS_STAGE.get(stage)
    return previous if previous is not None else stage
def advance(
    stage: Stage,
    status: StageStatus,
    event: TransitionEvent | str,
    *,
    hitl_required_stages: Iterable[int] | None = None,
    rollback_stage: Stage | None = None,
) -> TransitionOutcome:
    """Compute the next state given current stage, status, and event.

    *event* may be a ``TransitionEvent`` or its string value.
    *hitl_required_stages* overrides which gate stages need approval;
    *rollback_stage* overrides the default rollback target.

    Raises ValueError on unsupported transitions.
    """
    ev = TransitionEvent(event)
    target_rollback = rollback_stage or default_rollback_stage(stage)

    # START: PENDING / RETRYING / PAUSED all (re)enter RUNNING.
    if ev is TransitionEvent.START and status in (
        StageStatus.PENDING,
        StageStatus.RETRYING,
        StageStatus.PAUSED,
    ):
        return TransitionOutcome(
            stage=stage, status=StageStatus.RUNNING, next_stage=stage
        )

    transition = (ev, status)

    if transition == (TransitionEvent.SUCCEED, StageStatus.RUNNING):
        if gate_required(stage, hitl_required_stages):
            # Gate stage finished its work: hold for human approval.
            return TransitionOutcome(
                stage=stage,
                status=StageStatus.BLOCKED_APPROVAL,
                next_stage=stage,
                checkpoint_required=False,
                decision="block",
            )
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.DONE,
            next_stage=NEXT_STAGE[stage],
            checkpoint_required=True,
        )

    if transition == (TransitionEvent.APPROVE, StageStatus.BLOCKED_APPROVAL):
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.DONE,
            next_stage=NEXT_STAGE[stage],
            checkpoint_required=True,
        )

    if transition == (TransitionEvent.REJECT, StageStatus.BLOCKED_APPROVAL):
        # Human rejection rolls the machine back to the rollback target.
        return TransitionOutcome(
            stage=target_rollback,
            status=StageStatus.PENDING,
            next_stage=target_rollback,
            rollback_stage=target_rollback,
            checkpoint_required=True,
            decision="pivot",
        )

    if transition == (TransitionEvent.TIMEOUT, StageStatus.BLOCKED_APPROVAL):
        # Approval window expired: park the pipeline.
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.PAUSED,
            next_stage=stage,
            checkpoint_required=True,
            decision="block",
        )

    if transition == (TransitionEvent.FAIL, StageStatus.RUNNING):
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.FAILED,
            next_stage=stage,
            checkpoint_required=True,
            decision="retry",
        )

    if transition == (TransitionEvent.RETRY, StageStatus.FAILED):
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.RETRYING,
            next_stage=stage,
            decision="retry",
        )

    if transition == (TransitionEvent.RESUME, StageStatus.PAUSED):
        return TransitionOutcome(
            stage=stage, status=StageStatus.RUNNING, next_stage=stage
        )

    if transition == (TransitionEvent.PAUSE, StageStatus.FAILED):
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.PAUSED,
            next_stage=stage,
            checkpoint_required=True,
            decision="block",
        )

    raise ValueError(
        f"Unsupported transition: {status.value} + {ev.value} for stage {int(stage)}"
    )
================================================
FILE: researchclaw/pipeline/verified_registry.py
================================================
"""Verified Value Registry — ground truth for all experiment-sourced numbers.
Builds a whitelist of numeric values, condition names, and training config
from ``experiment_summary.json`` and ``refinement_log.json``. Used by
``paper_verifier.py`` and ``results_table_builder.py`` to ensure that
generated papers contain ONLY numbers grounded in real experiment data.
"""
from __future__ import annotations
import logging
import math
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
# Infrastructure metric keys — allowed in paper without verification
# (timings, counters, and run metadata rather than scientific results;
# from_experiment() routes these into training_config instead of values).
_INFRA_KEYS: set[str] = {
    "elapsed_sec",
    "total_elapsed_seconds",
    "TIME_ESTIMATE",
    "SEED_COUNT",
    "time_budget_sec",
    "condition_count",
    "total_runs",
    "total_conditions",
    "total_metric_keys",
    "stopped_early",
}
# Metric key patterns for per-seed results (e.g. "DQN/0/metric")
# group(1)=condition name, group(2)=seed index, group(3)=metric key.
_PER_SEED_PATTERN = re.compile(r"^(.+)/(\d+)/(.+)$")
@dataclass
class ConditionResult:
    """Aggregated results for one experimental condition."""

    name: str
    per_seed_values: dict[int, float] = field(default_factory=dict)
    mean: float | None = None
    std: float | None = None
    n_seeds: int = 0
    aggregate_metric: float | None = None  # The condition-level metric

    def compute_stats(self) -> None:
        """Fill ``n_seeds``, ``mean`` and ``std`` from the finite per-seed
        values. With no finite values, mean/std stay ``None``; with a single
        value, std is 0.0; otherwise std is the sample (n-1) deviation.
        """
        finite: list[float] = []
        for v in self.per_seed_values.values():
            if _is_finite(v):
                finite.append(v)
        n = len(finite)
        self.n_seeds = n
        if n == 0:
            return
        avg = sum(finite) / n
        self.mean = avg
        if n < 2:
            self.std = 0.0
        else:
            squared_dev = 0.0
            for v in finite:
                squared_dev += (v - avg) ** 2
            self.std = math.sqrt(squared_dev / (n - 1))
@dataclass
class VerifiedRegistry:
    """Registry of all numbers grounded in experiment data."""

    # Verified numeric value -> provenance string (e.g. "best_run.metrics.acc").
    values: dict[float, str] = field(default_factory=dict)
    # Names of experimental conditions that were actually run.
    condition_names: set[str] = field(default_factory=set)
    # Condition name -> aggregated per-seed results.
    conditions: dict[str, ConditionResult] = field(default_factory=dict)
    primary_metric: float | None = None
    primary_metric_std: float | None = None
    metric_direction: str = "maximize"  # "maximize" or "minimize"
    # Infrastructure/config numbers (see _INFRA_KEYS) kept out of `values`.
    training_config: dict[str, Any] = field(default_factory=dict)
def add_value(self, value: float, source: str) -> None:
"""Register a verified numeric value with its provenance."""
if not _is_finite(value):
return
self.values[value] = source
# Also register common transformations
self._add_variants(value, source)
def _add_variants(self, value: float, source: str) -> None:
"""Register rounding variants and percentage conversions."""
# Rounded variants (2, 3, 4 decimal places)
for dp in (1, 2, 3, 4):
rounded = round(value, dp)
if rounded != value and rounded not in self.values:
self.values[rounded] = f"{source} (rounded to {dp}dp)"
# Percentage conversion: if value is in [0, 1], also register value*100
if 0.0 < abs(value) <= 1.0:
pct = value * 100.0
if pct not in self.values:
self.values[pct] = f"{source} (×100)"
for dp in (1, 2, 3, 4):
pct_r = round(pct, dp)
if pct_r not in self.values:
self.values[pct_r] = f"{source} (×100, {dp}dp)"
# If value > 1 and could be a percentage, also register value/100
if abs(value) > 1.0:
frac = value / 100.0
if frac not in self.values:
self.values[frac] = f"{source} (÷100)"
def is_verified(self, number: float, tolerance: float = 0.01) -> bool:
"""Check if *number* matches any verified value within relative tolerance."""
if not _is_finite(number):
return False
for v in self.values:
if v == 0.0:
if abs(number) < 1e-6:
return True
elif abs(number - v) / max(abs(v), 1e-9) <= tolerance:
return True
return False
def lookup(self, number: float, tolerance: float = 0.01) -> str | None:
"""Return the source description if *number* is verified, else None."""
if not _is_finite(number):
return None
for v, src in self.values.items():
if v == 0.0:
if abs(number) < 1e-6:
return src
elif abs(number - v) / max(abs(v), 1e-9) <= tolerance:
return src
return None
def verify_condition(self, name: str) -> bool:
"""Check if condition name was actually run."""
return name in self.condition_names
    @classmethod
    def from_experiment(
        cls,
        experiment_summary: dict,
        refinement_log: dict | None = None,
        *,
        metric_direction: str = "maximize",
    ) -> VerifiedRegistry:
        """Build registry from experiment artifacts.

        Parameters
        ----------
        experiment_summary:
            Parsed ``experiment_summary.json``.
        refinement_log:
            Parsed ``refinement_log.json`` (optional, provides richer per-seed data).
        metric_direction:
            ``"maximize"`` or ``"minimize"`` — used for best-result detection.
        """
        reg = cls(metric_direction=metric_direction)
        # --- 1. Extract condition-level and per-seed metrics ---
        best_run = experiment_summary.get("best_run", {})
        metrics = best_run.get("metrics", {})
        # Parse per-seed structure: "CondName/seed/metric_key" → value
        for key, value in metrics.items():
            if not isinstance(value, (int, float)) or not _is_finite(value):
                continue
            if key in _INFRA_KEYS:
                # Infrastructure numbers are config, not scientific results.
                reg.training_config[key] = value
                continue
            reg.add_value(value, f"best_run.metrics.{key}")
            m = _PER_SEED_PATTERN.match(key)
            if m:
                cond_name, seed_str, _metric_name = m.group(1), m.group(2), m.group(3)
                seed_idx = int(seed_str)
                if cond_name not in reg.conditions:
                    reg.conditions[cond_name] = ConditionResult(name=cond_name)
                reg.conditions[cond_name].per_seed_values[seed_idx] = value
                reg.condition_names.add(cond_name)
        # --- 2. Extract condition_summaries ---
        for cond_name, cond_data in experiment_summary.get("condition_summaries", {}).items():
            reg.condition_names.add(cond_name)
            if cond_name not in reg.conditions:
                reg.conditions[cond_name] = ConditionResult(name=cond_name)
            cond_metrics = cond_data.get("metrics", {})
            for mk, mv in cond_metrics.items():
                if isinstance(mv, (int, float)) and _is_finite(mv):
                    reg.add_value(mv, f"condition_summaries.{cond_name}.{mk}")
                    # NOTE(review): this overwrites aggregate_metric on every
                    # numeric key, so only the LAST metric in iteration order
                    # survives — confirm that is intended.
                    reg.conditions[cond_name].aggregate_metric = mv
        # --- 3. Extract metrics_summary (min/max/mean per key) ---
        for key, stats in experiment_summary.get("metrics_summary", {}).items():
            if key in _INFRA_KEYS:
                continue
            for stat_name in ("min", "max", "mean"):
                v = stats.get(stat_name)
                if isinstance(v, (int, float)) and _is_finite(v):
                    reg.add_value(v, f"metrics_summary.{key}.{stat_name}")
        # --- 4. Extract primary_metric ---
        pm = _extract_primary_metric(metrics)
        if pm is not None:
            reg.primary_metric = pm
            reg.add_value(pm, "primary_metric")
        pm_std = metrics.get("primary_metric_std")
        if isinstance(pm_std, (int, float)) and _is_finite(pm_std):
            reg.primary_metric_std = pm_std
            reg.add_value(pm_std, "primary_metric_std")
        # --- 5. Compute per-condition stats ---
        for cond in reg.conditions.values():
            cond.compute_stats()
            if cond.mean is not None:
                reg.add_value(cond.mean, f"{cond.name}.mean")
            if cond.std is not None and cond.std > 0:
                reg.add_value(cond.std, f"{cond.name}.std")
        # --- 6. Compute pairwise differences (for comparative claims) ---
        # Registers signed/absolute deltas and relative % improvements so the
        # paper verifier accepts derived comparison numbers.
        cond_list = [c for c in reg.conditions.values() if c.mean is not None]
        for i, c1 in enumerate(cond_list):
            for c2 in cond_list[i + 1 :]:
                diff = c1.mean - c2.mean  # type: ignore[operator]
                if _is_finite(diff):
                    reg.add_value(diff, f"diff({c1.name}-{c2.name})")
                    reg.add_value(abs(diff), f"|diff({c1.name},{c2.name})|")
                # Relative improvement
                if c2.mean and abs(c2.mean) > 1e-9:  # type: ignore[operator]
                    rel = (c1.mean - c2.mean) / abs(c2.mean) * 100.0  # type: ignore[operator]
                    if _is_finite(rel):
                        reg.add_value(rel, f"rel_improve({c1.name} vs {c2.name})")
                        reg.add_value(abs(rel), f"|rel_improve({c1.name},{c2.name})|")
        # --- 7. Enrich from refinement_log (best iteration only) ---
        if refinement_log:
            _enrich_from_refinement_log(reg, refinement_log)
        logger.info(
            "VerifiedRegistry: %d values, %d conditions (%s), primary_metric=%s",
            len(reg.values),
            len(reg.condition_names),
            ", ".join(sorted(reg.condition_names)),
            reg.primary_metric,
        )
        return reg
@classmethod
def from_run_dir(
    cls,
    run_dir: Path,
    *,
    metric_direction: str = "maximize",
    best_only: bool = False,
) -> VerifiedRegistry:
    """Build registry from experiment data sources in *run_dir*.

    Parameters
    ----------
    run_dir:
        Root directory of a pipeline run, containing ``stage-13*`` /
        ``stage-14*`` sub-directories and optionally
        ``experiment_summary_best.json``.
    metric_direction:
        ``"maximize"`` or ``"minimize"``; forwarded to the new registry.
    best_only:
        BUG-222: When True, use ONLY ``experiment_summary_best.json``
        (the promoted best iteration) as the ground truth. This prevents
        regressed REFINE iterations from polluting the verified value set.
        When False (default), merges all ``stage-14*`` data for backward
        compatibility (e.g., pre-built table generation that needs all
        condition names).

    Scans (when ``best_only=False``):
    1. All ``stage-14*/experiment_summary.json`` (sorted, every version)
    2. ``experiment_summary_best.json`` at run root (repair cycle output)
    3. All ``stage-13*/refinement_log.json`` for enrichment
    """
    import json as _json_rd

    target = cls(metric_direction=metric_direction)
    # NOTE: the former ``except (OSError, JSONDecodeError, Exception)``
    # tuples were redundant — ``Exception`` already subsumes both. The
    # intent (never let a bad data file abort the build) is unchanged.
    if best_only:
        # BUG-222: Only use promoted best data
        best_path = run_dir / "experiment_summary_best.json"
        if best_path.is_file():
            try:
                best_data = _json_rd.loads(best_path.read_text(encoding="utf-8"))
                if isinstance(best_data, dict):
                    sub = cls.from_experiment(best_data, metric_direction=metric_direction)
                    _merge_into(target, sub)
                    logger.debug("from_run_dir(best_only): using experiment_summary_best.json (%d values)", len(sub.values))
            except Exception:  # noqa: BLE001 — best-effort: skip corrupt file
                logger.debug("from_run_dir(best_only): failed to load experiment_summary_best.json", exc_info=True)
        if not target.values:
            # Fallback: no best.json or it was empty — use stage-14/ (non-versioned)
            s14_path = run_dir / "stage-14" / "experiment_summary.json"
            if s14_path.is_file():
                try:
                    es_data = _json_rd.loads(s14_path.read_text(encoding="utf-8"))
                    if isinstance(es_data, dict):
                        sub = cls.from_experiment(es_data, metric_direction=metric_direction)
                        _merge_into(target, sub)
                except Exception:  # noqa: BLE001 — deliberate best-effort fallback
                    logger.debug("from_run_dir(best_only): fallback stage-14 load failed", exc_info=True)
    else:
        # --- 1. All stage-14* experiment summaries ---
        for es_path in sorted(run_dir.glob("stage-14*/experiment_summary.json")):
            try:
                es_data = _json_rd.loads(es_path.read_text(encoding="utf-8"))
                if not isinstance(es_data, dict):
                    continue
                sub = cls.from_experiment(es_data, metric_direction=metric_direction)
                _merge_into(target, sub)
                logger.debug("from_run_dir: merged %s (%d values)", es_path.name, len(sub.values))
            except Exception:  # noqa: BLE001 — skip unreadable/corrupt summaries
                logger.debug("from_run_dir: skipping %s", es_path, exc_info=True)
        # --- 2. experiment_summary_best.json (repair cycle output) ---
        best_path = run_dir / "experiment_summary_best.json"
        if best_path.is_file():
            try:
                best_data = _json_rd.loads(best_path.read_text(encoding="utf-8"))
                if isinstance(best_data, dict):
                    sub = cls.from_experiment(best_data, metric_direction=metric_direction)
                    _merge_into(target, sub)
                    logger.debug("from_run_dir: merged experiment_summary_best.json (%d values)", len(sub.values))
            except Exception:  # noqa: BLE001
                logger.debug("from_run_dir: skipping experiment_summary_best.json", exc_info=True)
        # --- 3. All refinement logs (enrichment) ---
        for rl_path in sorted(run_dir.glob("stage-13*/refinement_log.json")):
            try:
                rl_data = _json_rd.loads(rl_path.read_text(encoding="utf-8"))
                if isinstance(rl_data, dict):
                    _enrich_from_refinement_log(target, rl_data)
                    logger.debug("from_run_dir: enriched from %s", rl_path.name)
            except Exception:  # noqa: BLE001
                logger.debug("from_run_dir: skipping %s", rl_path, exc_info=True)
    # Recompute per-condition stats after merging
    for cond in target.conditions.values():
        cond.compute_stats()
        if cond.mean is not None:
            target.add_value(cond.mean, f"{cond.name}.mean")
        if cond.std is not None and cond.std > 0:
            target.add_value(cond.std, f"{cond.name}.std")
    logger.info(
        "VerifiedRegistry.from_run_dir(%s): %d values, %d conditions (%s)",
        "best_only" if best_only else "all",
        len(target.values),
        len(target.condition_names),
        ", ".join(sorted(target.condition_names)) if target.condition_names else "none",
    )
    return target
@classmethod
def from_files(
    cls,
    experiment_summary_path: Path,
    refinement_log_path: Path | None = None,
    *,
    metric_direction: str = "maximize",
) -> VerifiedRegistry:
    """Convenience: build registry from file paths."""
    import json

    summary_data = json.loads(experiment_summary_path.read_text(encoding="utf-8"))
    refinement_data = None
    if refinement_log_path and refinement_log_path.exists():
        refinement_data = json.loads(refinement_log_path.read_text(encoding="utf-8"))
    return cls.from_experiment(summary_data, refinement_data, metric_direction=metric_direction)
def _merge_into(target: VerifiedRegistry, source: VerifiedRegistry) -> None:
    """Merge *source* values, conditions, and condition_names into *target*."""
    # Verified values: first writer wins — existing descriptions are kept.
    for value, description in source.values.items():
        target.values.setdefault(value, description)
    target.condition_names.update(source.condition_names)
    for cname, cresult in source.conditions.items():
        merged = target.conditions.setdefault(cname, ConditionResult(name=cname))
        # Merge per-seed values (source wins on conflict — later data is better)
        merged.per_seed_values.update(cresult.per_seed_values)
        if cresult.aggregate_metric is not None:
            merged.aggregate_metric = cresult.aggregate_metric
    # Keep the best primary metric for the configured optimization direction.
    if source.primary_metric is not None:
        if target.primary_metric is None:
            target.primary_metric = source.primary_metric
        else:
            choose = max if target.metric_direction == "maximize" else min
            target.primary_metric = choose(target.primary_metric, source.primary_metric)
    if source.primary_metric_std is not None:
        # Only update std if the source's primary_metric actually won
        if target.primary_metric == source.primary_metric:
            target.primary_metric_std = source.primary_metric_std
    target.training_config.update(source.training_config)
def _enrich_from_refinement_log(reg: VerifiedRegistry, refinement_log: dict) -> None:
    """Add values from the best refinement iteration.

    Registers ``best_metric`` plus every iteration's metric, and folds the
    best iteration's per-seed sandbox metrics into the per-condition results.
    """
    best_metric = refinement_log.get("best_metric")
    if isinstance(best_metric, (int, float)) and _is_finite(best_metric):
        reg.add_value(best_metric, "refinement_log.best_metric")
    best_version = refinement_log.get("best_version", "")
    iterations = refinement_log.get("iterations", [])
    for it in iterations:
        ver = it.get("version_dir", "")
        # BUG fix: with an empty best_version, ``best_version in ver`` was
        # True for EVERY iteration ("" is a substring of any string), so
        # per-seed values from non-best iterations leaked in (last wins).
        # Require a non-empty best_version for the best-iteration match.
        is_best_iteration = bool(best_version) and (ver == best_version or best_version in ver)
        metric = it.get("metric")
        if isinstance(metric, (int, float)) and _is_finite(metric):
            reg.add_value(metric, f"refinement_log.iteration.{ver}")
        # Extract per-seed values from sandbox stdout if available
        for sandbox_key in ("sandbox", "sandbox_after_fix"):
            sandbox = it.get(sandbox_key, {})
            if not isinstance(sandbox, dict):
                continue
            sb_metrics = sandbox.get("metrics", {})
            if not isinstance(sb_metrics, dict):
                continue
            for mk, mv in sb_metrics.items():
                if not (isinstance(mv, (int, float)) and _is_finite(mv)) or mk in _INFRA_KEYS:
                    continue
                reg.add_value(mv, f"refinement.{ver}.{sandbox_key}.{mk}")
                # Parse per-seed keys here too
                m = _PER_SEED_PATTERN.match(mk)
                if m:
                    cond_name = m.group(1)
                    seed_idx = int(m.group(2))
                    reg.condition_names.add(cond_name)
                    if cond_name not in reg.conditions:
                        reg.conditions[cond_name] = ConditionResult(name=cond_name)
                    # Only record per-seed data from the best version
                    if is_best_iteration:
                        reg.conditions[cond_name].per_seed_values[seed_idx] = mv
def _extract_primary_metric(metrics: dict) -> float | None:
    """Extract primary_metric from metrics dict (None if absent/non-finite)."""
    candidate = metrics.get("primary_metric")
    if not isinstance(candidate, (int, float)) or not _is_finite(candidate):
        return None
    return float(candidate)
def _is_finite(value: Any) -> bool:
"""Check if value is a finite number (not NaN, not Inf, not bool)."""
if isinstance(value, bool):
return False
if not isinstance(value, (int, float)):
return False
return math.isfinite(value)
================================================
FILE: researchclaw/project/__init__.py
================================================
"""Multi-project management for AutoResearchClaw."""
from researchclaw.project.models import Idea, Project
from researchclaw.project.manager import ProjectManager
from researchclaw.project.scheduler import ProjectScheduler
from researchclaw.project.idea_pool import IdeaPool
__all__ = ["Idea", "Project", "ProjectManager", "ProjectScheduler", "IdeaPool"]
================================================
FILE: researchclaw/project/idea_pool.py
================================================
"""Idea pool: collect, evaluate, rank, and convert research ideas to projects."""
from __future__ import annotations
import json
import logging
import uuid
from pathlib import Path
from typing import Any
from researchclaw.project.models import Idea, Project
logger = logging.getLogger(__name__)
class IdeaPool:
    """Manage a pool of research ideas with evaluation and ranking.

    The pool is persisted as a single JSON file at *pool_path*; every
    mutating operation immediately rewrites that file.
    """

    def __init__(self, pool_path: str | Path) -> None:
        self.pool_path = Path(pool_path).expanduser().resolve()
        # In-memory index: idea id -> Idea.
        self.ideas: dict[str, Idea] = {}
        self._load()

    # ── persistence ───────────────────────────────────────────────
    def _load(self) -> None:
        """Load ideas from disk; unreadable/corrupt pools are logged and skipped."""
        if not self.pool_path.exists():
            return
        try:
            data = json.loads(self.pool_path.read_text(encoding="utf-8"))
            for entry in data.get("ideas", []):
                idea = Idea.from_dict(entry)
                self.ideas[idea.id] = idea
        except (OSError, json.JSONDecodeError, KeyError, ValueError, TypeError) as exc:
            # Robustness fix: also catch OSError (unreadable file) and
            # ValueError/TypeError (malformed entries, e.g. a bad ISO
            # timestamp in from_dict) — previously these crashed __init__.
            logger.warning("Failed to load idea pool: %s", exc)

    def _save(self) -> None:
        """Persist the whole pool to ``pool_path`` (creates parent dirs)."""
        self.pool_path.parent.mkdir(parents=True, exist_ok=True)
        data = {"ideas": [idea.to_dict() for idea in self.ideas.values()]}
        self.pool_path.write_text(
            json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    # ── CRUD ──────────────────────────────────────────────────────
    def add(self, title: str, description: str, domains: list[str] | None = None) -> Idea:
        """Add a new idea to the pool and persist immediately.

        Returns the created ``Idea``; its id is a random 8-hex-char handle.
        """
        idea_id = uuid.uuid4().hex[:8]
        idea = Idea(
            id=idea_id,
            title=title,
            description=description,
            domains=domains or [],
        )
        self.ideas[idea_id] = idea
        self._save()
        logger.info("Added idea %s: %s", idea_id, title)
        return idea

    def remove(self, idea_id: str) -> None:
        """Remove an idea from the pool. Raises KeyError for unknown ids."""
        if idea_id not in self.ideas:
            raise KeyError(f"Unknown idea: {idea_id}")
        del self.ideas[idea_id]
        self._save()

    def get(self, idea_id: str) -> Idea:
        """Get an idea by ID. Raises KeyError for unknown ids."""
        if idea_id not in self.ideas:
            raise KeyError(f"Unknown idea: {idea_id}")
        return self.ideas[idea_id]

    # ── evaluation ────────────────────────────────────────────────
    def evaluate(self, idea_id: str, feasibility: float, novelty: float) -> dict[str, Any]:
        """Set feasibility and novelty scores for an idea.

        Scores are clamped to [0, 1] and the idea's status becomes
        "evaluated". Returns a summary including the composite score.
        """
        idea = self.get(idea_id)
        idea.feasibility = max(0.0, min(1.0, feasibility))
        idea.novelty = max(0.0, min(1.0, novelty))
        idea.status = "evaluated"
        self._save()
        return {
            "id": idea.id,
            "feasibility": idea.feasibility,
            "novelty": idea.novelty,
            "score": idea.score,
        }

    def rank(self) -> list[Idea]:
        """Return all ideas sorted by composite score (descending)."""
        return sorted(self.ideas.values(), key=lambda i: i.score, reverse=True)

    # ── conversion ────────────────────────────────────────────────
    def to_project(self, idea_id: str, config_path: str, projects_dir: str | Path) -> Project:
        """Convert an idea into a project skeleton.

        The project name is derived from the idea title (lower-cased,
        spaces -> underscores, truncated to 40 chars); the idea's status
        becomes "planned".
        """
        idea = self.get(idea_id)
        # NOTE(review): local import — presumably avoids an import cycle
        # with project.manager; confirm before moving to module level.
        from researchclaw.project.manager import ProjectManager

        manager = ProjectManager(projects_dir)
        project = manager.create(
            name=idea.title.lower().replace(" ", "_")[:40],
            config_path=config_path,
            topic=idea.description,
        )
        idea.status = "planned"
        self._save()
        return project

    def list_all(self) -> list[Idea]:
        """Return all ideas sorted by creation time."""
        return sorted(self.ideas.values(), key=lambda i: i.created_at)
================================================
FILE: researchclaw/project/manager.py
================================================
"""Project manager: CRUD operations and status tracking for research projects."""
from __future__ import annotations
import json
import logging
import shutil
from pathlib import Path
from typing import Any
from researchclaw.project.models import Project
logger = logging.getLogger(__name__)
_REGISTRY_FILE = "registry.json"
class ProjectManager:
    """Manage multiple research projects with independent directories and configs.

    State is persisted in ``<projects_dir>/registry.json`` and reloaded on
    construction; every mutating operation rewrites the registry file.
    """

    def __init__(self, projects_dir: str | Path) -> None:
        self.projects_dir = Path(projects_dir).expanduser().resolve()
        # In-memory registry: project name -> Project.
        self.projects: dict[str, Project] = {}
        # Name of the currently active project (or None).
        self._active: str | None = None
        self._load_registry()

    # ── persistence ───────────────────────────────────────────────
    def _registry_path(self) -> Path:
        """Path of the on-disk registry JSON file."""
        return self.projects_dir / _REGISTRY_FILE

    def _load_registry(self) -> None:
        """Load project registry from disk; corrupt registries are logged and skipped."""
        path = self._registry_path()
        if not path.exists():
            return
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            for entry in data.get("projects", []):
                proj = Project.from_dict(entry)
                self.projects[proj.name] = proj
            self._active = data.get("active")
        except (OSError, json.JSONDecodeError, KeyError, ValueError, TypeError) as exc:
            # Robustness fix: also catch OSError (unreadable file) and
            # ValueError/TypeError (malformed entries, e.g. a bad ISO
            # timestamp in Project.from_dict) — previously these crashed
            # __init__.
            logger.warning("Failed to load project registry: %s", exc)

    def _save_registry(self) -> None:
        """Persist project registry to disk (creates projects_dir if needed)."""
        self.projects_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "active": self._active,
            "projects": [p.to_dict() for p in self.projects.values()],
        }
        self._registry_path().write_text(
            json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    # ── CRUD ──────────────────────────────────────────────────────
    def create(
        self,
        name: str,
        config_path: str,
        topic: str | None = None,
    ) -> Project:
        """Create a new project with an independent directory and config copy.

        Copies *config_path* into the project directory when it exists
        (otherwise the original path string is stored as-is), creates an
        ``artifacts`` run directory, and makes the project active if no
        project was active yet. Raises ValueError on duplicate names.
        """
        if name in self.projects:
            raise ValueError(f"Project already exists: {name}")
        project_dir = self.projects_dir / name
        project_dir.mkdir(parents=True, exist_ok=True)
        # Copy config to project directory
        src = Path(config_path).expanduser().resolve()
        if src.exists():
            dst = project_dir / "config.yaml"
            shutil.copy2(src, dst)
            stored_config = str(dst)
        else:
            stored_config = config_path
        run_dir = str(project_dir / "artifacts")
        Path(run_dir).mkdir(parents=True, exist_ok=True)
        project = Project(
            name=name,
            config_path=stored_config,
            run_dir=run_dir,
            topic=topic or "",
        )
        self.projects[name] = project
        if self._active is None:
            self._active = name
        self._save_registry()
        logger.info("Created project: %s", name)
        return project

    def delete(self, name: str) -> None:
        """Remove project from registry. Does NOT delete artifacts on disk."""
        if name not in self.projects:
            raise KeyError(f"Unknown project: {name}")
        del self.projects[name]
        if self._active == name:
            # Fall back to any remaining project (insertion order), or None.
            self._active = next(iter(self.projects), None)
        self._save_registry()
        logger.info("Deleted project (registry only): %s", name)

    def get(self, name: str) -> Project:
        """Get a single project by name. Raises KeyError for unknown names."""
        if name not in self.projects:
            raise KeyError(f"Unknown project: {name}")
        return self.projects[name]

    def list_all(self) -> list[Project]:
        """Return all projects sorted by creation time."""
        return sorted(self.projects.values(), key=lambda p: p.created_at)

    def get_status(self) -> dict[str, Any]:
        """Summary of all project statuses (totals, active name, per-status counts)."""
        projects = self.list_all()
        return {
            "total": len(projects),
            "active": self._active,
            "by_status": _count_by(projects, "status"),
            "projects": [
                {"name": p.name, "status": p.status, "topic": p.topic}
                for p in projects
            ],
        }

    # ── project switching ─────────────────────────────────────────
    def switch(self, name: str) -> Project:
        """Set the active project. Raises KeyError for unknown names."""
        if name not in self.projects:
            raise KeyError(f"Unknown project: {name}")
        self._active = name
        self._save_registry()
        return self.projects[name]

    @property
    def active(self) -> Project | None:
        """Currently active project (None if unset or no longer registered)."""
        if self._active and self._active in self.projects:
            return self.projects[self._active]
        return None

    # ── comparison ────────────────────────────────────────────────
    def compare(self, name_a: str, name_b: str) -> dict[str, Any]:
        """Compare metrics and status of two projects."""
        a = self.get(name_a)
        b = self.get(name_b)
        return {
            "project_a": {"name": a.name, "status": a.status, "topic": a.topic, "metrics": a.metrics},
            "project_b": {"name": b.name, "status": b.status, "topic": b.topic, "metrics": b.metrics},
            "metric_diff": _metric_diff(a.metrics, b.metrics),
        }

    # ── run lifecycle ─────────────────────────────────────────────
    def start_run(self, name: str, run_id: str) -> str:
        """Mark a project as running with a new run ID; returns *run_id*."""
        proj = self.get(name)
        proj.status = "running"
        proj.last_run_id = run_id
        self._save_registry()
        return run_id

    def finish_run(self, name: str, status: str, metrics: dict[str, Any] | None = None) -> None:
        """Mark a project run as completed or failed, optionally storing metrics."""
        proj = self.get(name)
        proj.status = status
        if metrics:
            proj.metrics = metrics
        self._save_registry()
def _count_by(projects: list[Project], attr: str) -> dict[str, int]:
counts: dict[str, int] = {}
for p in projects:
val = getattr(p, attr, "unknown")
counts[val] = counts.get(val, 0) + 1
return counts
def _metric_diff(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
all_keys = set(a) | set(b)
diff: dict[str, Any] = {}
for key in sorted(all_keys):
va, vb = a.get(key), b.get(key)
if isinstance(va, (int, float)) and isinstance(vb, (int, float)):
diff[key] = {"a": va, "b": vb, "delta": round(vb - va, 6)}
else:
diff[key] = {"a": va, "b": vb}
return diff
================================================
FILE: researchclaw/project/models.py
================================================
"""Data models for multi-project management."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
@dataclass
class Project:
    """A research project managed by AutoResearchClaw."""

    name: str
    config_path: str
    run_dir: str
    # Lifecycle state: idle | running | completed | failed
    status: str = "idle"
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    last_run_id: str | None = None
    topic: str = ""
    metrics: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize project to a dictionary."""
        return {
            "name": self.name,
            "config_path": self.config_path,
            "run_dir": self.run_dir,
            "status": self.status,
            "created_at": self.created_at.isoformat(),
            "last_run_id": self.last_run_id,
            "topic": self.topic,
            "metrics": self.metrics,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Project:
        """Deserialize project from a dictionary."""
        raw_ts = data.get("created_at")
        if isinstance(raw_ts, str):
            # Timestamps are persisted via isoformat(); parse them back.
            parsed_ts = datetime.fromisoformat(raw_ts)
        elif raw_ts is None:
            parsed_ts = datetime.now(timezone.utc)
        else:
            parsed_ts = raw_ts
        return cls(
            name=data["name"],
            config_path=data["config_path"],
            run_dir=data["run_dir"],
            status=data.get("status", "idle"),
            created_at=parsed_ts,
            last_run_id=data.get("last_run_id"),
            topic=data.get("topic", ""),
            metrics=data.get("metrics", {}),
        )
@dataclass
class Idea:
    """A research idea that can be evaluated and converted to a project."""

    id: str
    title: str
    description: str
    # Lifecycle state: draft | evaluated | planned | running | completed
    status: str = "draft"
    feasibility: float = 0.0  # 0-1
    novelty: float = 0.0  # 0-1
    domains: list[str] = field(default_factory=list)
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    @property
    def score(self) -> float:
        """Composite score: weighted average of feasibility and novelty."""
        return 0.4 * self.feasibility + 0.6 * self.novelty

    def to_dict(self) -> dict[str, Any]:
        """Serialize idea to a dictionary."""
        return {
            "id": self.id,
            "title": self.title,
            "description": self.description,
            "status": self.status,
            "feasibility": self.feasibility,
            "novelty": self.novelty,
            "domains": self.domains,
            "created_at": self.created_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Idea:
        """Deserialize idea from a dictionary."""
        raw_ts = data.get("created_at")
        if isinstance(raw_ts, str):
            # Timestamps are persisted via isoformat(); parse them back.
            parsed_ts = datetime.fromisoformat(raw_ts)
        elif raw_ts is None:
            parsed_ts = datetime.now(timezone.utc)
        else:
            parsed_ts = raw_ts
        return cls(
            id=data["id"],
            title=data["title"],
            description=data["description"],
            status=data.get("status", "draft"),
            feasibility=float(data.get("feasibility", 0.0)),
            novelty=float(data.get("novelty", 0.0)),
            domains=data.get("domains", []),
            created_at=parsed_ts,
        )
================================================
FILE: researchclaw/project/scheduler.py
================================================
"""Project scheduler: priority queue and concurrency control for pipeline runs."""
from __future__ import annotations
import heapq
import logging
from dataclasses import dataclass, field
from typing import Any
from researchclaw.project.manager import ProjectManager
logger = logging.getLogger(__name__)
@dataclass(order=True)
class _QueueEntry:
    """Priority queue entry (lower priority number = higher priority)."""

    # Sole ordering key — heapq compares entries by this field only.
    priority: int
    # compare=False keeps the project name out of heap comparisons, so two
    # entries with equal priority never compare by name.
    project_name: str = field(compare=False)
class ProjectScheduler:
    """Schedule project pipeline runs with priority and concurrency limits."""

    def __init__(self, manager: ProjectManager, max_concurrent: int = 2) -> None:
        self.manager = manager
        self.max_concurrent = max_concurrent
        # Min-heap of _QueueEntry: lowest priority number pops first.
        self._queue: list[_QueueEntry] = []
        # Names of projects currently occupying a concurrency slot.
        self._running: set[str] = set()

    def enqueue(self, project_name: str, priority: int = 0) -> None:
        """Add a project to the run queue (no-op if already queued or running)."""
        if project_name not in self.manager.projects:
            raise KeyError(f"Unknown project: {project_name}")
        # Avoid duplicate enqueue
        if any(entry.project_name == project_name for entry in self._queue):
            logger.info("Project %s already in queue", project_name)
            return
        if project_name in self._running:
            logger.info("Project %s already running", project_name)
            return
        heapq.heappush(self._queue, _QueueEntry(priority=priority, project_name=project_name))
        logger.info("Enqueued project %s with priority %d", project_name, priority)

    def dequeue(self) -> str | None:
        """Remove and return the highest-priority project, or None when empty."""
        if not self._queue:
            return None
        return heapq.heappop(self._queue).project_name

    def next(self) -> str | None:
        """Claim the next project to run, if a concurrency slot is available."""
        if not self.can_start():
            return None
        candidate = self.dequeue()
        if candidate is not None:
            self._running.add(candidate)
        return candidate

    def can_start(self) -> bool:
        """True when a slot is free AND at least one project is queued."""
        return bool(self._queue) and len(self._running) < self.max_concurrent

    def mark_done(self, project_name: str) -> None:
        """Mark a running project as finished (frees a concurrency slot)."""
        self._running.discard(project_name)

    @property
    def queue_size(self) -> int:
        """Number of projects waiting in the queue."""
        return len(self._queue)

    @property
    def running_count(self) -> int:
        """Number of projects currently running."""
        return len(self._running)

    def get_status(self) -> dict[str, Any]:
        """Scheduler status overview (limits, running set, queue contents)."""
        return {
            "max_concurrent": self.max_concurrent,
            "running": sorted(self._running),
            "running_count": len(self._running),
            "queued": [entry.project_name for entry in sorted(self._queue)],
            "queue_size": len(self._queue),
        }
================================================
FILE: researchclaw/prompts.py
================================================
"""Prompt externalization for the ResearchClaw pipeline.
All 23 stage prompts are defined here as defaults and can be overridden
via a user-provided YAML file. Users customize prompts without touching
Python source code.
Architecture
------------
* ``_DEFAULT_STAGES`` — every LLM-facing prompt, keyed by stage name.
* ``_DEFAULT_BLOCKS`` — reusable prompt fragments (topic constraint, etc.).
* ``_DEFAULT_SUB_PROMPTS`` — secondary prompts (code repair, etc.).
* ``PromptManager`` — loads defaults → merges user overrides → renders templates.
* ``_render()`` — safe ``{variable}`` substitution that leaves unmatched
patterns (JSON schemas, curly-brace literals) untouched.
Usage
-----
::
from researchclaw.prompts import PromptManager
pm = PromptManager() # defaults only
pm = PromptManager("my_prompts.yaml") # with user overrides
sp = pm.for_stage("topic_init", topic="RL for drug discovery", domains="ml, bio")
resp = llm.chat(
[{"role": "user", "content": sp.user}],
system=sp.system,
json_mode=sp.json_mode,
max_tokens=sp.max_tokens,
)
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import yaml
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Template rendering
# ---------------------------------------------------------------------------
def _render(template: str, variables: dict[str, str]) -> str:
"""Replace ``{var_name}`` placeholders with *variables* values.
Only bare ``{word_chars}`` tokens are substituted — JSON schema
examples like ``{candidates:[...]}`` or ``{score_1_to_10:number}``
are left untouched because the regex requires the closing ``}``
immediately after the identifier.
"""
def _replacer(match: re.Match[str]) -> str:
key = match.group(1)
return str(variables[key]) if key in variables else match.group(0)
return re.sub(r"\{(\w+)\}", _replacer, template)
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class RenderedPrompt:
    """Fully rendered prompt ready for ``llm.chat()``."""

    # Final system prompt text (template variables already substituted).
    system: str
    # Final user prompt text, possibly with an evolution overlay appended.
    user: str
    # Whether the LLM call should request JSON-formatted output.
    json_mode: bool = False
    # Optional completion-token cap; None leaves the limit to the caller.
    max_tokens: int | None = None
# ---------------------------------------------------------------------------
# PromptManager
# ---------------------------------------------------------------------------
class PromptManager:
"""Central registry for pipeline prompts with optional YAML overrides."""
def __init__(self, overrides_path: str | Path | None = None) -> None:
# Deep-copy defaults so mutations don't leak across instances
self._stages: dict[str, dict[str, Any]] = {
k: dict(v) for k, v in _DEFAULT_STAGES.items()
}
self._blocks: dict[str, str] = dict(_DEFAULT_BLOCKS)
self._sub_prompts: dict[str, dict[str, Any]] = {
k: dict(v) for k, v in _DEFAULT_SUB_PROMPTS.items()
}
if overrides_path:
self._load_overrides(Path(overrides_path))
# -- loading ----------------------------------------------------------
def _load_overrides(self, path: Path) -> None:
if not path.exists():
logger.warning("Prompts file not found: %s — using defaults", path)
return
try:
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Bad prompts YAML %s: %s — using defaults", path, exc)
return
for stage_name, stage_data in (data.get("stages") or {}).items():
if stage_name in self._stages and isinstance(stage_data, dict):
self._stages[stage_name].update(stage_data)
else:
logger.warning("Unknown stage in prompts file: %s", stage_name)
for block_name, block_text in (data.get("blocks") or {}).items():
if isinstance(block_text, str):
self._blocks[block_name] = block_text
for sub_name, sub_data in (data.get("sub_prompts") or {}).items():
if sub_name in self._sub_prompts and isinstance(sub_data, dict):
self._sub_prompts[sub_name].update(sub_data)
logger.info("Loaded prompt overrides from %s", path)
# -- primary API ------------------------------------------------------
def for_stage(
self,
stage: str,
*,
evolution_overlay: str = "",
**kwargs: Any,
) -> RenderedPrompt:
"""Return a fully rendered prompt for *stage* with variables filled.
If *evolution_overlay* is provided, it is appended to the user prompt
so the LLM can learn from prior run lessons.
"""
entry = self._stages[stage]
kw = {k: str(v) for k, v in kwargs.items()}
user_text = _render(entry["user"], kw)
if evolution_overlay:
user_text = f"{user_text}\n\n{evolution_overlay}"
return RenderedPrompt(
system=_render(entry["system"], kw),
user=user_text,
json_mode=entry.get("json_mode", False),
max_tokens=entry.get("max_tokens"),
)
def system(self, stage: str) -> str:
"""Return the raw system prompt template for *stage*."""
return self._stages[stage]["system"]
def user(self, stage: str, **kwargs: Any) -> str:
"""Return the rendered user prompt for *stage*."""
return _render(
self._stages[stage]["user"],
{k: str(v) for k, v in kwargs.items()},
)
def json_mode(self, stage: str) -> bool:
return self._stages[stage].get("json_mode", False)
def max_tokens(self, stage: str) -> int | None:
return self._stages[stage].get("max_tokens")
# -- blocks -----------------------------------------------------------
def block(self, name: str, **kwargs: Any) -> str:
"""Render a reusable prompt block."""
return _render(
self._blocks[name],
{k: str(v) for k, v in kwargs.items()},
)
# -- sub-prompts (code repair, etc.) ----------------------------------
def sub_prompt(self, name: str, **kwargs: Any) -> RenderedPrompt:
"""Return a rendered sub-prompt (e.g. code_repair)."""
entry = self._sub_prompts[name]
kw = {k: str(v) for k, v in kwargs.items()}
return RenderedPrompt(
system=_render(entry["system"], kw),
user=_render(entry["user"], kw),
)
# -- introspection ----------------------------------------------------
def stage_names(self) -> list[str]:
return list(self._stages.keys())
def has_stage(self, stage: str) -> bool:
return stage in self._stages
def export_yaml(self, path: Path) -> None:
    """Serialise the active prompt set (defaults + overrides) to *path*.

    The output mirrors the prompts.yaml schema: ``version``, ``blocks``,
    ``stages`` and ``sub_prompts``. Entries are shallow-copied so the
    dump never aliases live internal state.
    """
    stages = {name: dict(entry) for name, entry in self._stages.items()}
    subs = {name: dict(entry) for name, entry in self._sub_prompts.items()}
    payload: dict[str, Any] = {
        "version": "1.0",
        "blocks": dict(self._blocks),
        "stages": stages,
        "sub_prompts": subs,
    }
    text = yaml.dump(payload, default_flow_style=False, allow_unicode=True, width=120)
    path.write_text(text, encoding="utf-8")
# ========================================================================
# DEFAULT PROMPTS — edit prompts.yaml to override; do NOT edit these.
# ========================================================================
# -- Canonical section word-count targets ----------------------------------
# Single source of truth for per-section word-count ranges.
# Used by executor._validate_draft_quality() and converter.check_paper_completeness().
SECTION_WORD_TARGETS: dict[str, tuple[int, int]] = {
    # Each value is a (lower, upper) word-count range for the canonical
    # (lower-cased) section heading. Headings not listed have no target;
    # variant headings map here via _SECTION_TARGET_ALIASES.
    "abstract": (180, 220),
    "introduction": (800, 1000),
    "related work": (600, 800),
    "method": (1000, 1500),
    "experiments": (800, 1200),
    "results": (600, 800),
    "discussion": (400, 600),
    "limitations": (200, 300),
    "conclusion": (200, 300),
    "broader impact": (200, 400),
}
# Aliases mapping heading variants to canonical names in SECTION_WORD_TARGETS.
_SECTION_TARGET_ALIASES: dict[str, str] = {
    # Keys are lower-cased heading variants; values are canonical keys
    # that must exist in SECTION_WORD_TARGETS.
    # Method-section variants.
    "methods": "method",
    "methodology": "method",
    "proposed method": "method",
    "approach": "method",
    # Experiments / results variants.
    "experimental setup": "experiments",
    "experimental results": "results",
    "results and discussion": "results",
    "results and analysis": "results",
    # Conclusion variants.
    "conclusions": "conclusion",
    "conclusion and future work": "conclusion",
    "summary": "conclusion",
    # Related-work variants.
    "background": "related work",
    "literature review": "related work",
    "prior work": "related work",
    # Limitations variants.
    "limitation": "limitations",
    "limitations and future work": "limitations",
    # Broader-impact variants.
    "broader impacts": "broader impact",
    "societal impact": "broader impact",
    "ethical considerations": "broader impact",
}
# -- Reusable blocks -----------------------------------------------------
_DEFAULT_BLOCKS: dict[str, str] = {
"title_guidelines": (
"\n## TITLE RULES (Hard Constraints)\n"
"1. MAXIMUM 14 words. Ideal: 8-12 words. NEVER exceed 14.\n"
"2. Preferred structure: 'MethodName: Descriptive Phrase' (colon format)\n"
" - Create a catchy 1-3 word method name (acronym, portmanteau, or evocative word)\n"
" - Subtitle explains what it does: 'for X' / 'via Y' / 'in Z'\n"
" - Examples: 'AlphaEdit: Null-Space Knowledge Editing for LMs' (8 words)\n"
" - Examples: 'VAR: Visual Autoregressive Modeling via Next-Scale Prediction' (8 words)\n"
"3. Alternative: Bold declarative claim that surprises the reader\n"
" - 'Not All Tokens Are What You Need for Pretraining' (9 words)\n"
" - 'Vision Transformers Need Registers' (4 words)\n"
"4. FORBIDDEN patterns:\n"
" - 'Investigating...', 'An Empirical Study of...', 'Towards...'\n"
" - 'A Novel Approach to...', 'On the...' (generic academic filler)\n"
" - Repeating the full method description as title\n"
" - Weakness qualifiers: 'in Two Runs', 'Under Limited Data'\n"
"5. MUST define a short method name (2-5 chars) that serves as memorable handle.\n"
" The reader should be able to say 'Have you read the X paper?'\n"
"6. No abbreviations unless universally known (LLM, RL, GAN, NLP are OK).\n"
),
"abstract_structure": (
"\n## ABSTRACT (Hard Rules — 180-220 words, 5-7 sentences)\n"
"STRUCTURE (PMR+ format):\n"
"S1-S2: PROBLEM — What gap exists? Why does it matter? (NO method names yet)\n"
"S3-S4: METHOD — Name your system. One-sentence description of key insight.\n"
"S5-S6: RESULTS — At most 3 specific numbers. Use relative improvements\n"
" ('X% over baseline') not raw values ('0.7667'). Bold the single most\n"
" important result.\n"
"S7 (optional): IMPACT — What does this enable?\n\n"
"HARD CONSTRAINTS:\n"
"- NO \\texttt{{}} in abstract\n"
"- NO more than 3 numeric values in the entire abstract\n"
"- NO per-seed breakdowns or confidence intervals\n"
"- NO method names longer than 3 words (use the short system name)\n"
"- The abstract must be readable by a researcher who skimmed only the title\n"
"- First sentence must NOT start with 'We' or 'This paper'\n"
),
"compute_budget": (
"\n## Compute Budget Constraint\n"
"- Total execution time limit: {time_budget_sec} seconds\n"
"- You MUST design experiments that complete within this budget\n"
"- Estimate: a simple numpy loop runs ~10M iterations/sec; a nested loop over\n"
" conditions runs proportionally slower\n"
"- SCALING RULES (mandatory):\n"
" - If total conditions > 100: reduce seeds to 3-5 (not 20)\n"
" - If total conditions > 500: reduce to 2-3 representative conditions per factor\n"
" - If time_budget < 300s: limit total optimization steps to ≤5,000 per run\n"
" - If time_budget < 120s: limit total optimization steps to ≤1,000 per run\n"
" - Always print intermediate results so partial data is captured on timeout\n"
"- MANDATORY: print a 'TIME_ESTIMATE: Xs' line before the main loop,\n"
" estimating total runtime based on a small pilot (run 1 condition, extrapolate)\n"
"- MANDATORY: implement a time guard — check elapsed time periodically and\n"
" stop gracefully if approaching 80% of budget, saving all results collected so far\n"
"- MANDATORY: add NaN/divergence fast-fail guard:\n"
" - After each optimization step, check if loss is NaN or > 100\n"
" - If detected, print 'FAIL: NaN/divergence detected', save partial results, and exit\n"
" - Do NOT waste compute on a diverging run\n"
"- MINIMUM TRAINING EPOCHS (CRITICAL for meaningful results):\n"
" - CIFAR-10/100 with ResNet/CNN: minimum 50 epochs (200 recommended)\n"
" - FashionMNIST with small CNN: minimum 20 epochs\n"
" - RL environments: follow the RL STEP BUDGET below (CRITICAL)\n"
" - If time_budget is too short for minimum epochs, REDUCE model complexity\n"
" or dataset size INSTEAD of reducing epochs. 8 epochs on CIFAR-10 will\n"
" produce random-chance accuracy (~10%), making all comparisons meaningless.\n"
" - Use a SMALL model (simple CNN, few layers) to fit enough epochs into the budget.\n"
" - A converged small model is worth infinitely more than a diverged large model.\n"
"- MANDATORY: use the experiment_harness module (pre-installed in sandbox):\n"
" ```\n"
" from experiment_harness import ExperimentHarness\n"
" harness = ExperimentHarness(time_budget={time_budget_sec})\n"
" # In your experiment loop:\n"
" if harness.should_stop():\n"
" break # graceful stop at 80% of budget\n"
" if not harness.check_value(value, 'metric_name'):\n"
" print('SKIP: NaN/Inf detected') # skip invalid values\n"
" continue\n"
" harness.report_metric('metric_name', value) # validated output\n"
" # At the end of ALL experiments:\n"
" harness.finalize() # writes results.json — MUST be called\n"
" ```\n"
" The harness provides: time budget enforcement, NaN/Inf detection,\n"
" validated metric reporting, and results.json output. NOT using it\n"
" means your metrics may be lost or malformed.\n"
),
"topic_constraint": (
"\n\n=== HARD TOPIC CONSTRAINT ===\n"
"The paper MUST be about: {topic}\n"
"PROHIBITED content (unless user explicitly specifies case-study mode):\n"
"- Do NOT treat environment setup, dependency installation, or infrastructure "
"failures as a research contribution.\n"
"- Do NOT present debugging logs, system errors, or configuration issues "
"as experimental findings.\n"
"- Do NOT drift to tangential topics not directly related to the stated topic.\n"
"- Every section MUST connect back to the core research question.\n"
"- The Abstract and Introduction MUST clearly state the research problem "
"derived from: {topic}\n"
"- The Method section MUST describe a technical approach, not a workflow.\n"
"- The Results section MUST report quantitative outcomes of experiments, "
"not environment status.\n"
"=== END CONSTRAINT ===\n"
),
"pkg_hint_sandbox": (
"\nAVAILABLE PACKAGES (sandbox mode): Python stdlib, numpy, math, random, "
"statistics, json.\n"
"Do NOT use: torch, tensorflow, jax, sklearn, pandas, scipy, matplotlib, "
"or any deep learning framework.\n"
"Write the experiment using ONLY numpy and stdlib.\n"
),
"dataset_guidance": (
"\n## Standard Datasets & Real Baselines (MANDATORY when applicable)\n"
"You MUST use real benchmark datasets — NEVER synthetic torch.randn() data.\n\n"
"### Tier 1: Pre-cached (ALWAYS available, use download=False)\n"
"These datasets are already in the Docker image. Use download=False:\n"
"- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n"
"- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n"
"### Tier 2: Downloadable (use setup.py to download before main.py runs)\n"
"For any dataset NOT in Tier 1, create a `setup.py` file that downloads it.\n"
"setup.py runs WITH network access; main.py runs WITHOUT network.\n"
"- Any torchvision dataset (Caltech-101, Flowers102, etc.)\n"
"- HuggingFace datasets: `from datasets import load_dataset`\n"
" Examples: IMDB, AG News, WikiText, SST-2, SQuAD, MMLU\n"
"- OGB benchmarks: ogbg-molhiv, ogbn-arxiv, etc.\n"
"- Tiny-ImageNet (237MB, 200 classes) — good ImageNet proxy\n\n"
"### Tier 3: Too large for download (use alternatives)\n"
"These datasets are TOO LARGE to download within experiment time limits:\n"
"- ImageNet-1K (168GB) → use Tiny-ImageNet or CIFAR-100 as proxy\n"
"- LAION (>1TB) → use smaller HuggingFace image-text datasets\n"
"- Common Crawl, The Pile → use WikiText-103 or pre-tokenized subsets\n"
"NEVER generate 'ImageNet-like' synthetic data — always use a real alternative.\n\n"
"### ANTI-PATTERNS (NEVER DO THESE):\n"
"- `torch.randn(N, 3, 224, 224)` as dataset → use real datasets\n"
"- `download=True` in main.py → put downloads in setup.py\n"
"- `download=False` for non-cached datasets → will FileNotFoundError\n"
"- Random train/test splits → use official splits from dataset\n"
"- `os.makedirs('/opt/datasets/...')` → /opt/datasets is READ-ONLY\n\n"
"DATA PATH: For Tier 1 pre-cached datasets, use `/opt/datasets` as root.\n"
"For Tier 2 datasets downloaded by setup.py, use `/workspace/data` as root.\n"
"WARNING: `/opt/datasets` is READ-ONLY. NEVER call os.makedirs() on it.\n"
"Just pass `root='/opt/datasets'` directly to torchvision dataset constructors.\n\n"
"DISTRIBUTION SHIFT — use torchvision corruption transforms:\n"
"- Gaussian noise: `transforms.Lambda(lambda x: x + torch.randn_like(x) * sigma)`\n"
"- Brightness shift: `transforms.ColorJitter(brightness=0.5)`\n"
"- Contrast shift: `transforms.ColorJitter(contrast=0.5)`\n"
"- Blur: `transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0))`\n"
"- For CIFAR-10-C style corruptions, apply transforms to test set only.\n\n"
"REAL BASELINES & MODERN BENCHMARKS (CRITICAL):\n"
"- Use proper train/test splits from the dataset (never split randomly in code)\n"
"- Use standard architectures (ResNet-18/50, ViT, ConvNeXt) — not toy 2-layer MLPs\n"
"- CIFAR INPUT SIZE (IMPORTANT): CIFAR images are 32×32. Two valid approaches:\n"
" 1. PRETRAINED models (ImageNet weights): Use `transforms.Resize(224)` — "
"pretrained models require 224×224 inputs.\n"
" 2. TRAINING FROM SCRATCH (most experiments): Modify the model for 32×32 "
"inputs instead of resizing. For ResNet: use `nn.Conv2d(3,64,3,1,1)` as "
"first conv (not 7×7/stride-2) and REMOVE the initial MaxPool. This is 49× "
"more memory-efficient and trains faster than Resize(224). Use the `timm` "
"library's CIFAR variants or build a custom `get_resnet18_cifar()` helper.\n"
"- Report standard metrics (top-1 accuracy for classification tasks)\n"
"- Compare against published baselines where available\n"
"- BASELINES MUST BE CURRENT: Use baselines from recent top-venue papers "
"(2023-2026). Do NOT use outdated methods as the primary comparison.\n"
" * AlexNet, VGG-16 → use ResNet-50, ViT, ConvNeXt instead\n"
" * Vanilla SGD → use AdamW, SGD+momentum+cosine LR\n"
" * Simple RNN/LSTM for NLP → use Transformer-based models\n"
"- Include at LEAST one strong, modern baseline (near-SOTA).\n"
"- BENCHMARKS MUST BE STANDARD and actively used in the community.\n\n"
"WHEN TO USE SYNTHETIC DATA (required for these domains):\n"
"- **PDE / Scientific computing**: Generate synthetic PDE data (Burgers "
"equation, Darcy flow, heat equation, Navier-Stokes). Use numerical solvers "
"(scipy.integrate, finite differences) to create ground truth.\n"
"- **Combinatorial optimization** (TSP, graph coloring, scheduling): Generate "
"random problem instances (random TSP cities, Erdos-Renyi graphs).\n"
"- **Theoretical analysis**: Synthetic optimization landscapes, toy problems.\n"
"- **Domain with no standard dataset**: Novel combinatorial or mathematical domains.\n"
"For these domains, do NOT use CIFAR/MNIST/ImageNet — they are irrelevant. "
"Generate problem-specific synthetic data in main.py.\n\n"
"DOMAIN-DATASET MATCHING (CRITICAL):\n"
"- Image classification → CIFAR-10/100, MNIST, ImageNet variants\n"
"- NLP → IMDB, AG News, SST-2, WikiText\n"
"- Graph learning → Cora, CiteSeer, ogbn-arxiv\n"
"- PDE/Physics → SYNTHETIC (Burgers, Darcy, Navier-Stokes)\n"
"- Combinatorial optimization → SYNTHETIC (random TSP, graph instances)\n"
"- RL → Gymnasium environments (CartPole, LunarLander, HalfCheetah)\n"
"NEVER use image datasets for non-image problems.\n"
),
"setup_script_guidance": (
"\n## Setup Script (setup.py) — Dataset Download & Preparation\n"
"If your experiment needs datasets NOT in the pre-cached list, generate "
"a SEPARATE file called `setup.py` that downloads and prepares them.\n"
"The setup.py runs WITH NETWORK ACCESS before main.py (which runs WITHOUT network).\n\n"
"IMPORTANT: All download logic MUST be in setup.py, NOT in main.py.\n"
"main.py should only load pre-cached data from /opt/datasets (download=False) "
"or downloaded data from /workspace/data.\n\n"
"Example setup.py:\n"
"```python\n"
"import os\n"
"DATA_DIR = '/workspace/data'\n"
"os.makedirs(DATA_DIR, exist_ok=True)\n\n"
"# Download torchvision datasets\n"
"import torchvision\n"
"torchvision.datasets.Caltech101(root=DATA_DIR, download=True)\n\n"
"# Download HuggingFace datasets\n"
"from datasets import load_dataset\n"
"ds = load_dataset('imdb', cache_dir=os.path.join(DATA_DIR, 'hf'))\n\n"
"# Download OGB benchmarks\n"
"# from ogb.graphproppred import PygGraphPropPredDataset\n"
"# dataset = PygGraphPropPredDataset(name='ogbg-molhiv', root=DATA_DIR)\n\n"
"print('[setup] Dataset download complete.')\n"
"```\n\n"
"IMPORT ANTI-PATTERN (NEVER DO THIS):\n"
"```python\n"
"from datasets import load_dataset\n"
"datasets.load_dataset('imdb', ...) # WRONG — NameError!\n"
"```\n"
"If you write `from datasets import load_dataset`, call `load_dataset(...)` directly.\n"
"If you write `import datasets`, call `datasets.load_dataset(...)` with module prefix.\n"
"NEVER mix the two styles.\n\n"
"If ALL your datasets are pre-cached (CIFAR-10/100, MNIST, FashionMNIST, "
"STL-10, SVHN), you do NOT need setup.py — just use download=False in main.py.\n\n"
"You may also include a `requirements.txt` file listing any additional "
"pip packages your experiment needs beyond the pre-installed set.\n"
),
"network_disabled_guidance": (
"\n## ⚠️ NO NETWORK ACCESS — CRITICAL CONSTRAINT ⚠️\n"
"This experiment runs with network_policy='none'. There is NO network access\n"
"at ANY phase (no pip install, no dataset downloads, no HTTP requests).\n\n"
"### ONLY these pre-cached datasets are available:\n"
"- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n"
"- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n"
"### FORBIDDEN (will cause runtime failure):\n"
"- Do NOT create setup.py (it cannot run without network)\n"
"- Do NOT create requirements.txt (pip install is unavailable)\n"
"- Do NOT use `download=True` on any dataset\n"
"- Do NOT use `urllib`, `requests`, `httpx`, or any HTTP library\n"
"- Do NOT use `datasets.load_dataset()` from HuggingFace (requires download)\n"
"- Do NOT import packages not pre-installed in the Docker image\n\n"
"### Available pre-installed packages:\n"
"torch, torchvision, torchaudio, numpy, scipy, sklearn, matplotlib, seaborn,\n"
"pandas, tqdm, gymnasium, networkx, PyYAML, Pillow, timm, einops, torchmetrics,\n"
"h5py, transformers, datasets, accelerate, peft, bitsandbytes.\n\n"
"If your research topic requires a dataset NOT in the pre-cached list,\n"
"you MUST adapt to use one of the 6 pre-cached datasets instead.\n"
),
"network_full_guidance": (
"\n## Network Access: Full\n"
"This experiment runs with network_policy='full'. Network access is available\n"
"throughout ALL execution phases (setup, pip install, and main experiment).\n"
"You may download datasets, install packages, and make HTTP requests at any time.\n"
),
"hp_reporting": (
"\n## Hyperparameter Reporting (MANDATORY)\n"
"At the TOP of main.py, define a HYPERPARAMETERS dictionary containing ALL "
"tunable hyperparameters used in your experiment:\n"
"```python\n"
"HYPERPARAMETERS = {\n"
" 'learning_rate': 0.001,\n"
" 'batch_size': 64,\n"
" 'num_epochs': 50,\n"
" 'hidden_dim': 256,\n"
" # ... all other hyperparameters\n"
"}\n"
"```\n"
"At the end of main.py, save hyperparameters to results.json:\n"
"```python\n"
"import json\n"
"results = {'hyperparameters': HYPERPARAMETERS, 'metrics': collected_metrics}\n"
"with open('results.json', 'w') as f:\n"
" json.dump(results, f, indent=2)\n"
"```\n"
"EVERY hyperparameter must be used in the code — no dead parameters.\n"
"The paper MUST include a hyperparameter table — this data feeds into it.\n"
),
"rl_step_guidance": (
"\n## RL Training Step Budget (MANDATORY for RL experiments)\n"
"Reinforcement learning requires MANY more training steps than supervised learning.\n"
"Under-trained RL agents produce random-chance performance, making ALL comparisons\n"
"meaningless and the paper unpublishable.\n\n"
"### Environment Availability:\n"
"#### Always available (classic control — no extra dependencies):\n"
"- CartPole-v1, Pendulum-v1, MountainCar-v0, MountainCarContinuous-v0,\n"
" Acrobot-v1, LunarLander-v3\n"
"- These are lightweight and fast — PREFER these unless MuJoCo is specifically required.\n\n"
"#### MuJoCo environments (pre-installed in Docker image):\n"
"- HalfCheetah-v5, Hopper-v5, Walker2d-v5, Ant-v5, Humanoid-v5,\n"
" Swimmer-v5, Reacher-v5, InvertedPendulum-v5, InvertedDoublePendulum-v5\n"
"- Require MuJoCo runtime — available in Docker but NOT in basic sandbox mode.\n\n"
"#### RULE: If the research topic says 'MuJoCo-free', 'without MuJoCo',\n"
" or 'classic control only' → you MUST use classic control environments ONLY.\n"
" Do NOT import or reference MuJoCo in any way.\n\n"
"#### DEFAULT RECOMMENDATION: Prefer classic control environments unless the\n"
" research topic specifically requires MuJoCo locomotion tasks.\n\n"
"### ALGORITHM-ENVIRONMENT COMPATIBILITY (HARD RULE — violation = crash):\n"
"- DQN is ONLY for DISCRETE action spaces (CartPole, LunarLander, Acrobot, Atari).\n"
" DQN will CRASH on Pendulum, HalfCheetah, Hopper, Walker2d, etc.\n"
"- For CONTINUOUS action spaces: use SAC, TD3, or PPO.\n"
"- PPO works for both discrete and continuous.\n"
"- NEVER combine DQN + any continuous environment.\n\n"
"### TIME BUDGET RULES FOR RL:\n"
"- If time_budget ≤ 3600s → ONLY classic control "
"(CartPole, Pendulum, MountainCar, Acrobot, LunarLander)\n"
"- If time_budget ≤ 1800s → ONLY CartPole or Pendulum (simplest)\n"
"- MuJoCo requires >5000s for meaningful results.\n\n"
"### Minimum Steps by Algorithm Family:\n"
"| Algorithm | Environment | Min Steps | Recommended |\n"
"|-----------|-------------|-----------|-------------|\n"
"| PPO | MuJoCo (Ant, HalfCheetah, Humanoid) | 500K | 1M-3M |\n"
"| PPO | Simple control (CartPole, Pendulum) | 100K | 500K |\n"
"| SAC/TD3 | MuJoCo locomotion | 300K | 1M |\n"
"| SAC/TD3 | Simple control | 50K | 200K |\n"
"| DQN/Rainbow | Atari | 1M | 10M |\n"
"| A2C/A3C | Any continuous | 500K | 2M |\n"
"| REINFORCE | Any | 200K | 1M |\n\n"
"### Step Budget Allocation Strategy:\n"
"1. Compute pilot_time = time for 1000 steps of 1 condition.\n"
"2. steps_per_sec = 1000 / pilot_time.\n"
"3. max_steps_per_condition = (time_budget * 0.7) / num_conditions * steps_per_sec.\n"
"4. If max_steps < min_steps for the algorithm, REDUCE num_seeds to 3 (not steps).\n"
"5. If STILL under min_steps, use a simpler environment (e.g., Pendulum instead of Ant).\n"
"6. NEVER reduce steps below the minimum — it wastes compute on meaningless results.\n\n"
"### Evaluation Protocol for RL:\n"
"- Evaluate every N_eval steps (e.g., every 10K steps) using deterministic policy.\n"
"- Run 10 evaluation episodes per checkpoint.\n"
"- Report: mean return, std return, success rate (if applicable).\n"
"- Plot learning curves (return vs steps) — this is EXPECTED by reviewers.\n"
"- Final metric = mean over last 10 evaluation checkpoints (NOT last episode).\n\n"
"### Gymnasium Environment Version (CRITICAL):\n"
"- Use v5 environments (NOT v4): `gym.make('HalfCheetah-v5')`, `gym.make('Hopper-v5')`\n"
"- v4 environments are deprecated and will produce warnings.\n"
"- Available MuJoCo v5 envs: HalfCheetah-v5, Hopper-v5, Walker2d-v5, Ant-v5,\n"
" Humanoid-v5, Swimmer-v5, Reacher-v5, InvertedPendulum-v5, InvertedDoublePendulum-v5\n"
"- For simple/fast experiments: use Pendulum-v1, CartPole-v1, MountainCarContinuous-v0\n\n"
"### Gymnasium API (CRITICAL — common crash source):\n"
"- `env.reset()` returns `(obs, info)` — ALWAYS unpack both:\n"
" `obs, info = env.reset(seed=seed)`\n"
"- `env.step(action)` returns `(obs, reward, terminated, truncated, info)` — 5 values:\n"
" `obs, reward, terminated, truncated, info = env.step(action)`\n"
" `done = terminated or truncated`\n"
"- DO NOT use old `done = env.step(action)[2]` — this is the Gym (v0.26-) API.\n"
"- `reward` is a scalar float, NOT an array. Do NOT index it: use `reward` directly.\n"
"- `obs` shape depends on env: discrete envs give 1D array, image envs give 3D.\n"
" Always check `env.observation_space.shape` and handle accordingly.\n\n"
"### Learning Curve Logging (MANDATORY for RL papers):\n"
"- Print evaluation metrics at regular intervals: every N_eval steps\n"
" `EVAL: step= condition= seed= return=`\n"
"- This enables plotting learning curves (return vs training steps)\n"
"- Learning curves are EXPECTED by RL reviewers — a paper without them\n"
" will be rejected regardless of final performance.\n"
"- At the end, print the full curve:\n"
" `LEARNING_CURVE: condition= seed= steps=[...] returns=[...]`\n"
),
"multi_seed_enforcement": (
"\n## Multi-Seed Experiment Requirement (MANDATORY — NO EXCEPTIONS)\n"
"Running each condition with only 1 seed is NEVER acceptable. Results from\n"
"a single seed cannot distinguish signal from noise and reviewers will reject.\n\n"
"### HARD REQUIREMENT:\n"
"- You MUST use exactly seeds = [0, 1, 2] (3 seeds minimum).\n"
"- Each condition MUST loop over ALL seeds.\n"
"- Print per-seed: `condition=X seed=S {metric_key}: V`\n"
"- Print aggregated: `condition=X {metric_key}_mean: M {metric_key}_std: S`\n"
"- Tables MUST show mean ± std, NEVER single-run values.\n\n"
"### Implementation Pattern (copy this structure):\n"
"```python\n"
"SEEDS = [0, 1, 2] # EXACTLY 3 seeds — mandatory minimum\n"
"all_results = {} # {condition_name: {seed: metric_value}}\n\n"
"for condition_name, ConditionClass in conditions.items():\n"
" all_results[condition_name] = {}\n"
" for seed in SEEDS:\n"
" set_all_seeds(seed) # torch, numpy, random\n"
" result = run_single(ConditionClass, seed=seed)\n"
" all_results[condition_name][seed] = result\n"
" print(f'condition={condition_name} seed={seed} metric: {result}')\n"
" values = list(all_results[condition_name].values())\n"
" print(f'condition={condition_name} metric_mean: {np.mean(values):.4f} '\n"
" f'metric_std: {np.std(values):.4f}')\n"
"```\n\n"
"### Reporting Requirements:\n"
"- Print per-seed results: `condition=X seed=S metric: V`\n"
"- Print aggregated: `condition=X metric_mean: M metric_std: S`\n"
"- Tables in the paper MUST show mean ± std, NEVER single-run values.\n"
"- If time budget forces < 5 seeds, use EXACTLY 3 seeds (minimum).\n"
" Print: `SEED_WARNING: only 3 seeds used due to time budget`.\n"
),
"writing_structure": (
"\n## Paper Section Writing Rules\n"
"MARKDOWN FORMATTING (CRITICAL):\n"
"- Use `# Title` (H1) for the paper title\n"
"- Use `# Abstract`, `# Introduction`, `# Method`, etc. (H1) for MAIN sections\n"
"- Use `## Subsection Name` (H2) for subsections WITHIN a main section\n"
"- NEVER use `##` for main sections — that produces wrong LaTeX heading levels\n"
"- Each main section (H1) MUST contain subsections (H2) when it exceeds 3 paragraphs\n"
"- NEVER place sub-topics (e.g., 'Knowledge Distillation for Compact Models') "
"at the same heading level as main sections (e.g., 'Related Work')\n"
"- NEVER wrap the paper in ```markdown fences\n"
"- NEVER use raw variable names (e.g., `method_name/metric_key = 0.85`) — "
"always use human-readable text\n\n"
"ABSTRACT (150-200 words, 5-sentence structure):\n"
"- (1) Problem and significance (2) Prior approaches and gaps\n"
"- (3) Your approach and novelty (4) Key results with 2-3 specific numbers\n"
"- (5) Implication/takeaway\n"
"- Do NOT list per-seed ranges (e.g., '0.71-0.73 across seeds') — use mean +/- std\n"
"- Do NOT repeat numbers that appear in the Results section — pick the 2-3 most impactful\n\n"
"INTRODUCTION (4 paragraphs, 800-1000 words, cite 8-12 references):\n"
"Paragraph 1: Problem motivation (why this matters). "
"Paragraph 2: What exists and why it falls short. "
"Paragraph 3: Your approach and key insight. "
"Paragraph 4: Contributions (2-3 bullet points allowed here ONLY).\n\n"
"RELATED WORK:\n"
"Organize by sub-topic, not chronologically. "
"End each paragraph with how YOUR work differs from the cited work. "
"Cite at least 15 references, all directly relevant.\n\n"
"METHOD:\n"
"Write as flowing narrative prose (NOT bullet points). "
"Include full algorithm description with pseudocode or step-by-step. "
"State all hyperparameters with values and justification. "
"Provide architecture details sufficient for reproduction.\n\n"
"RESULTS:\n"
"- Do NOT repeat the same number more than twice across the paper\n"
"- Each number in a table should be discussed AT MOST once in text\n"
"- Tables: mean +/- std with 95% CI in parentheses\n"
"- Bold the best result in each column\n"
"- Every comparison claim must cite a p-value or note multiple seeds\n"
"- Report the number of random seeds/runs used\n\n"
"FIGURES AND TABLES:\n"
"- Every figure MUST be referenced in the text (e.g., 'As shown in Figure 1')\n"
"- Every table MUST be referenced in the text (e.g., 'Table 2 summarizes')\n"
"- Figure captions: 1-2 descriptive sentences (not just 'Results comparison')\n"
"- Table captions go ABOVE the table; figure captions go BELOW the figure\n"
"- Axis labels must include units where applicable\n"
"- Use consistent font sizes across all figures\n\n"
"DISCUSSION (if applicable, can be merged into Results):\n"
"- Paragraph 1: Summarize key findings and their significance\n"
"- Paragraph 2: Compare with prior work — explain WHY results differ\n"
"- Paragraph 3: Discuss unexpected or negative results honestly\n"
"- Paragraph 4: Broader implications and practical applications\n\n"
"LIMITATIONS (3-5 points):\n"
"- State each limitation ONCE, here only — not scattered throughout\n"
"- No disclaimers like 'due to computational constraints'\n"
"- Include compute resources used (GPU type, training time)\n\n"
"CONCLUSION:\n"
"- Summarize findings (match actual results, no aspirational claims)\n"
"- 2-3 sentences of future work\n\n"
"PROSE QUALITY (CRITICAL — violation = desk reject):\n"
"- Write FLOWING ACADEMIC PARAGRAPHS, not bullet-point lists.\n"
"- Each paragraph must have 4-8 sentences with smooth transitions.\n"
"- Introduction, Related Work, and Method must each be >=3 paragraphs.\n"
"- FORBIDDEN: starting 3+ consecutive paragraphs with the same word.\n"
"- FORBIDDEN: bullet-point lists in Introduction or Related Work sections.\n"
"- Use varied sentence structures: mix simple, compound, and complex sentences.\n"
"- Connect paragraphs with transition phrases: 'Building on this insight...', "
"'In contrast to prior work...', 'To address this limitation...'.\n"
"- Each Related Work paragraph must COMPARE your approach to cited work, "
"not merely summarize what each paper does.\n"
"- FORBIDDEN AI-BOILERPLATE phrases (instant credibility loss):\n"
" 'delves into', 'it is worth noting', 'plays a crucial role',\n"
" 'leverages the power of', 'paves the way', 'a myriad of',\n"
" 'paradigm shift', 'groundbreaking', 'in the realm of',\n"
" 'holistic approach', 'multifaceted', 'navigate the complexities'.\n"
" Replace ALL such phrases with precise, specific academic language.\n"
),
"llm_training_guidance": (
"\n## LLM Fine-Tuning Guidance (when topic involves language model training)\n"
"AVAILABLE FRAMEWORKS (pre-installed in Docker):\n"
"- transformers (AutoModelForCausalLM, AutoTokenizer, Trainer)\n"
"- peft (LoraConfig, get_peft_model, PeftModel)\n"
"- trl (SFTTrainer, DPOTrainer, GRPOTrainer)\n"
"- datasets (load_dataset, Dataset)\n"
"- accelerate (Accelerator)\n"
"- bitsandbytes (4-bit/8-bit quantization)\n\n"
"GPU MEMORY GUIDELINES (RTX 6000 Ada, 49GB VRAM):\n"
"- Full fine-tune: <=3B parameters\n"
"- LoRA (16-bit): <=14B parameters\n"
"- QLoRA (4-bit): <=72B parameters (practical limit ~14B for training)\n"
"- Optimal: 7B-14B model with QLoRA (rank 16-64)\n\n"
"RECOMMENDED TRAINING PATTERN:\n"
"```python\n"
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n"
"from peft import LoraConfig, get_peft_model, TaskType\n"
"from trl import SFTTrainer, SFTConfig\n"
"from datasets import load_dataset\n\n"
"# 4-bit quantization for memory efficiency\n"
"bnb_config = BitsAndBytesConfig(\n"
" load_in_4bit=True,\n"
" bnb_4bit_quant_type='nf4',\n"
" bnb_4bit_compute_dtype=torch.bfloat16,\n"
")\n"
"model = AutoModelForCausalLM.from_pretrained(\n"
" model_name, quantization_config=bnb_config, device_map='auto'\n"
")\n"
"lora_config = LoraConfig(\n"
" r=16, lora_alpha=32, target_modules='all-linear',\n"
" lora_dropout=0.05, task_type=TaskType.CAUSAL_LM,\n"
")\n"
"model = get_peft_model(model, lora_config)\n"
"```\n\n"
"KEY HYPERPARAMETERS:\n"
"- learning_rate: 1e-4 to 2e-4 (LoRA), 5e-5 to 1e-4 (full FT)\n"
"- lora_r: 8 (minimal) to 64 (high-capacity)\n"
"- lora_alpha: typically 2x lora_r\n"
"- batch_size: 1-4 per device (use gradient_accumulation_steps for effective batch)\n"
"- gradient_accumulation_steps: 4-16 (effective_batch = per_device * accum)\n"
"- max_seq_length: 512 (short), 1024-2048 (standard), 4096 (long)\n"
"- warmup_ratio: 0.03-0.1\n"
"- weight_decay: 0.01-0.1\n\n"
"DATA FORMAT (use datasets library):\n"
"- Instruction tuning: {'instruction': '...', 'output': '...'}\n"
"- Chat format: {'messages': [{'role': 'user', 'content': '...'}, ...]}\n"
"- DPO: {'prompt': '...', 'chosen': '...', 'rejected': '...'}\n"
"- Use load_dataset('json', data_files='train.json') for local data\n"
"- Use load_dataset('HuggingFace/dataset_name') for HF Hub datasets\n\n"
"EVALUATION:\n"
"- Use evaluate library for standard metrics\n"
"- Common: perplexity, ROUGE (summarization), BLEU (translation), accuracy\n"
"- LLM benchmarks: MMLU, ARC, HellaSwag, TruthfulQA\n"
"- Generate sample outputs for qualitative comparison\n\n"
"MODEL DOWNLOAD:\n"
"- Models will be downloaded from HuggingFace Hub at runtime\n"
"- Use 'trust_remote_code=True' for custom model architectures\n"
"- Cache directory: default HF cache (~/.cache/huggingface)\n"
"- Common models: Qwen/Qwen2.5-7B, meta-llama/Llama-3.1-8B, "
"microsoft/Phi-4, google/gemma-2-9b\n\n"
"CRITICAL — NO SIMULATION:\n"
"- You MUST load and train a REAL model from HuggingFace Hub.\n"
"- NEVER simulate training with synthetic utility functions or random scores.\n"
"- NEVER replace model training with np.random/torch.randn mock results.\n"
"- A real experiment loads a model, tokenizes data, runs optimizer steps, "
"and measures real loss/perplexity/accuracy on held-out data.\n"
"- If compute budget is tight, use a SMALLER model (Qwen2.5-0.5B or 1.5B) "
"with fewer training steps rather than simulating.\n"
),
"llm_eval_guidance": (
"\n## LLM Evaluation Guidance\n"
"STANDARD BENCHMARKS:\n"
"- Reasoning: MMLU, ARC-Challenge, HellaSwag, WinoGrande\n"
"- Math: GSM8K, MATH, MathVista\n"
"- Coding: HumanEval, MBPP, LiveCodeBench\n"
"- Safety: TruthfulQA, BBQ, CrowS-Pairs\n"
"- Instruction following: MT-Bench, AlpacaEval, IFEval\n"
"- Multimodal: MMBench, POPE, MathVista, MMMU\n\n"
"EVALUATION FRAMEWORKS:\n"
"- lm-eval-harness: Standard eval framework, run via CLI or Python API\n"
"- vllm: Fast inference engine for throughput-focused evaluation\n"
"- lighteval: HuggingFace's lightweight eval framework\n\n"
"EVALUATION PROTOCOL:\n"
"- Report on at least 3 benchmarks relevant to the task\n"
"- Compare with published baselines from model cards/leaderboards\n"
"- Report both zero-shot and few-shot results where applicable\n"
"- Include perplexity on held-out test set\n"
),
# IMP-20: Academic writing style guide (from NeurIPS/ICLR/ICML 2024-2025 best papers)
"academic_style_guide": (
"\n## ACADEMIC WRITING STANDARDS (from NeurIPS/ICLR/ICML 2024-2025 best papers)\n\n"
"### Title Standards\n"
"- Target 8-14 words. Median of award-winning papers: ~10 words.\n"
"- Preferred format: 'SystemName: Descriptive Subtitle' (35% of best papers)\n"
" e.g., 'AlphaEdit: Null-Space Constrained Knowledge Editing for Language Models'\n"
"- Alternative: Declarative statement that surprises\n"
" e.g., 'Not All Tokens Are What You Need for Pretraining'\n"
"- Give your method a memorable, catchy name (VAR, Genie, PRISM, SEDD).\n"
"- NEVER exceed 18 words. NEVER use 'A Novel Approach to...' or 'Investigating...'\n\n"
"### Abstract Standards (PMR+ Structure, 180-220 words)\n"
"S1-S2: PROBLEM — State the gap. Open with a challenge or status-quo critique.\n"
"S3-S4: METHOD — Name your system by sentence 3. Describe the key insight.\n"
"S5-S6: RESULTS — At least 2-3 concrete quantitative claims:\n"
" - One relative improvement ('36.7% boost over baseline')\n"
" - One absolute benchmark score ('FID of 1.01 on ImageNet')\n"
"AVOID: Per-seed ranges, excessive texttt, defensive hedging.\n\n"
"### Section Writing Standards\n"
"INTRODUCTION (800-1000 words, 4 paragraphs):\n"
" - Para 1: Motivation; Para 2: Gap (cite 3-5 papers); Para 3: Your approach;\n"
" Para 4: Contributions (bullet list of 3-4 specific contributions)\n"
" - MUST cite 8-12 references throughout Introduction\n\n"
"RELATED WORK (600-800 words):\n"
" - Organize by sub-topic (2-3 subsections), NOT as a flat list\n"
" - End each subsection with how YOUR work differs\n"
" - Target >= 15 unique references in this section alone\n\n"
"METHOD (1000-1500 words):\n"
" - Start with problem formulation (notation, objective function)\n"
" - Use algorithm environment for pseudocode (not verbatim)\n"
" - Write as a flowing narrative, NOT bullet points\n\n"
"EXPERIMENTS (800-1200 words):\n"
" - Experimental setup as subsection (datasets, baselines, metrics, hardware)\n"
" - Hyperparameter table (Table 1 always)\n"
" - MUST reference figures: 'As shown in Figure 1, our method...'\n"
" - MUST cite baseline method papers (not just name them)\n\n"
"RESULTS (600-800 words):\n"
" - Main results table with descriptive caption\n"
" - Ablation study table\n"
" - Analysis paragraphs connecting numbers to insights\n"
" - DO NOT repeat the same numbers from Experiments section\n"
" - Reference figures for visual evidence\n\n"
"DISCUSSION (400-600 words):\n"
" - Compare findings with prior work (cite papers here!)\n"
" - Explain surprising results; broader implications\n\n"
"LIMITATIONS (200-300 words): 3-5 specific, concrete limitations. ALL caveats go HERE.\n\n"
"CONCLUSION: Summarize in 2-3 sentences, future work in 2-3 sentences.\n\n"
"### Writing Quality Rules\n"
"- Write as FLOWING PROSE, not bullet points or enumerated lists\n"
"- Each paragraph: topic sentence, evidence, analysis, transition\n"
"- Use transitions: 'Building on this insight...', 'In contrast to...'\n"
"- Academic tone: confident but precise\n"
"- Vary sentence structure: mix short declarative with longer analytical\n"
"- AVOID: Starting 3+ consecutive sentences with 'We', 'The', 'Our'\n"
"- AVOID: 'It is worth noting that', 'It should be mentioned that' (filler)\n"
"- Citations belong in EVERY section, not just Introduction and Related Work\n"
),
# IMP-25: Narrative writing requirements
"narrative_writing_rules": (
"\n## NARRATIVE WRITING REQUIREMENTS\n\n"
"You are writing a paper for human reviewers at a top AI conference. The paper\n"
"must read like a cohesive academic story, NOT a technical report or bullet list.\n\n"
"### Structure of Each Paragraph\n"
"Every paragraph MUST follow this pattern:\n"
"1. TOPIC SENTENCE — states the main claim or finding\n"
"2. EVIDENCE — data, citations, or reasoning that supports the claim\n"
"3. ANALYSIS — what the evidence means, why it matters\n"
"4. TRANSITION — connects to the next paragraph's topic\n\n"
"### FORBIDDEN Writing Patterns\n"
"- Bullet-point lists in the main body (ONLY allowed in Contributions paragraph\n"
" of Introduction and Limitations section)\n"
"- Numbered lists of findings or results\n"
"- Starting a paragraph with 'Table X shows...' without context first\n"
"- Consecutive short sentences without analysis between them\n"
"- Repeating the same sentence structure 3+ times in a row\n\n"
"### REQUIRED Writing Patterns\n"
"- Transition phrases: 'Building on this observation...', 'In contrast to prior work...'\n"
"- Vary sentence length: alternate between short impactful and longer analytical\n"
"- Ground every claim in evidence: '[Result] because [mechanism] (cite)'\n"
"- Discuss implications: 'This X% improvement indicates that [mechanism Y]\n"
" is more effective than [mechanism Z] for [context]'\n"
"- For temporal data: describe trends in prose rather than bullet-point lists\n\n"
"### Example: BAD vs GOOD Method Description\n"
"BAD (bullet-list style):\n"
" 'Our method has three components:\n"
" - Component A\n"
" - Component B\n"
" - Component C'\n\n"
"GOOD (narrative style):\n"
" 'Our method builds on the insight that [core problem] stems from\n"
" [root cause identified in Section 2]. To address this, we introduce\n"
" [MethodName], a [N]-stage framework. First, [Stage 1] maps inputs\n"
" to [representation]. These representations feed into [Stage 2],\n"
" enabling [benefit] without [drawback of prior approaches].\n"
" Crucially, we augment this with [Stage 3] based on [technical\n"
" foundation] (cite original paper), triggering [mechanism] when\n"
" [condition is met].'\n"
" NOTE: Replace all [placeholders] with YOUR actual method details.\n"
" Do NOT copy this template verbatim.\n"
),
# IMP-31: Anti-hedging rules
"anti_hedging_rules": (
"\n## ANTI-HEDGING RULES (MANDATORY)\n"
"1. The following phrases are BANNED from the paper body:\n"
" - 'we do not claim' / 'we cannot claim'\n"
" - 'we intentionally frame this conservatively'\n"
" - 'the evidence does not support' (unless followed by what it DOES support)\n"
" - 'only N seeds/runs' (belongs ONLY in Limitations, stated ONCE)\n"
" - 'this paper is not' / 'we do not' as paragraph openers\n"
"2. Limitations and caveats MUST be consolidated in the Limitations section.\n"
" They may NOT appear in Introduction, Method, Results, or Conclusion.\n"
"3. Confidence framing: Instead of 'we cannot prove X', write 'our results\n"
" provide evidence for X' or 'X is supported by [metrics]'.\n"
"4. If you have a negative result, frame it as an INSIGHT:\n"
" BAD: 'Our method failed to outperform the baseline, we do not claim...'\n"
" GOOD: 'Surprisingly, the standard baseline proved competitive, suggesting\n"
" that [insight about why] — an observation with practical implications for...'\n"
),
# IMP-24: Anti-repetition rules
"anti_repetition_rules": (
"\n## ANTI-REPETITION RULE\n"
"Each specific number (e.g., '0.7667', '36.7%') may appear in AT MOST 2 sections:\n"
" - Once in Results/Experiments (where it is first reported)\n"
" - Once in Abstract (as a summary highlight)\n"
"The Introduction, Discussion, and Conclusion MUST refer to results qualitatively\n"
"('significantly outperformed', 'X% improvement') WITHOUT repeating exact numbers\n"
"from the Results section. Violation of this rule will result in desk rejection.\n"
),
}
# -- Debate role prompts (multi-perspective generation) -------------------
# Persona prompts for multi-perspective ("debate") hypothesis generation.
# Maps role name -> {"system": persona instruction, "user": task template}.
# Each "user" template carries {topic} and {synthesis} placeholders that the
# caller substitutes before issuing the LLM request.
DEBATE_ROLES_HYPOTHESIS: dict[str, dict[str, str]] = {
    # High-risk/high-reward ideation: bold, cross-domain hypotheses, but every
    # hypothesis must still be feasible (30 min, single GPU) and falsifiable.
    "innovator": {
        "system": (
            "You are a bold, creative researcher who thinks outside the box. "
            "You pursue high-risk high-reward ideas, draw cross-domain analogies, "
            "and propose counter-intuitive hypotheses that challenge mainstream thinking."
        ),
        "user": (
            "Generate at least 2 novel, unconventional hypotheses from the synthesis below.\n"
            "CRITICAL REQUIREMENTS for EVERY hypothesis:\n"
            "1. NOVELTY: Must go beyond incremental combination of existing methods.\n"
            "2. FEASIBILITY: Must be testable within 30 minutes of compute on a single GPU.\n"
            "3. FALSIFIABILITY: Must define a specific metric threshold that would reject it.\n"
            "For each hypothesis provide:\n"
            "- A bold claim that pushes boundaries\n"
            "- Cross-domain inspiration (if applicable)\n"
            "- Rationale grounded in the literature gaps\n"
            "- Measurable prediction and failure condition\n"
            "- Estimated risk level (low/medium/high)\n\n"
            "Topic: {topic}\n"
            "Synthesis:\n{synthesis}"
        ),
    },
    # Engineering-minded counterweight: feasible, incremental hypotheses
    # grounded in proven techniques and limited compute.
    "pragmatist": {
        "system": (
            "You are a practical ML engineer focused on what actually works. "
            "You prioritize computational feasibility, engineering simplicity, "
            "reliable baselines, and incremental but solid improvements."
        ),
        "user": (
            "Generate at least 2 feasible, well-grounded hypotheses from the synthesis below.\n"
            "For each hypothesis provide:\n"
            "- A concrete, testable claim with clear methodology\n"
            "- Why this is achievable with limited compute\n"
            "- Rationale based on proven techniques\n"
            "- Measurable prediction and failure condition\n"
            "- Resource requirements estimate\n\n"
            "Topic: {topic}\n"
            "Synthesis:\n{synthesis}"
        ),
    },
    # Devil's advocate: challenges widely-held assumptions and surfaces
    # informative negative results the other two roles might miss.
    "contrarian": {
        "system": (
            "You are a rigorous devil's advocate who challenges assumptions. "
            "You find blind spots, hidden failure modes, and counter-evidence. "
            "Your value is in finding problems others ignore. Be provocative "
            "but always grounded in evidence."
        ),
        "user": (
            "Critically examine the synthesis and generate at least 2 contrarian hypotheses.\n"
            "For each hypothesis provide:\n"
            "- A challenge to a widely-held assumption in this area\n"
            "- Evidence or reasoning for why the mainstream view may be wrong\n"
            "- An alternative hypothesis that accounts for overlooked factors\n"
            "- Measurable prediction and failure condition\n"
            "- Potential negative results that would be informative\n\n"
            "Topic: {topic}\n"
            "Synthesis:\n{synthesis}"
        ),
    },
}
# Persona prompts for multi-perspective analysis of experiment results.
# Same shape as DEBATE_ROLES_HYPOTHESIS: role name -> {"system", "user"}.
# The "user" templates carry {preamble}, {data_context}, and {context}
# placeholders that the caller substitutes before the LLM call.
DEBATE_ROLES_ANALYSIS: dict[str, dict[str, str]] = {
    # Highlights successes, unexpected positives, and promising follow-ups.
    "optimist": {
        "system": (
            "You highlight positive findings, promising extensions, and silver linings "
            "in experimental results. You identify what worked well and why, "
            "and suggest how to build on successes."
        ),
        "user": (
            "Analyze the experiment results from an optimistic perspective.\n"
            "Cover:\n"
            "- What worked well and why\n"
            "- Unexpected positive findings\n"
            "- Promising extensions and next steps\n"
            "- Silver linings in any negative results\n\n"
            "{preamble}\n{data_context}\n"
            "Run context:\n{context}"
        ),
    },
    # Stress-tests claims: statistical validity, confounds, missing controls.
    "skeptic": {
        "system": (
            "You question the significance of results with maximum rigor. "
            "You check statistical validity, identify confounds, and demand "
            "stronger evidence. Every claim must earn its place."
        ),
        "user": (
            "Critically scrutinize the experiment results.\n"
            "Cover:\n"
            "- Statistical concerns (significance, sample size, multiple comparisons)\n"
            "- Potential confounds and alternative explanations\n"
            "- Missing evidence or controls\n"
            "- Whether metrics truly capture the intended phenomenon\n\n"
            "{preamble}\n{data_context}\n"
            "Run context:\n{context}"
        ),
    },
    # Audits HOW the experiment was run: validity, reproducibility, fairness.
    "methodologist": {
        "system": (
            "You scrutinize HOW experiments were conducted. You audit "
            "internal/external validity, reproducibility, baseline fairness, "
            "and evaluation protocols."
        ),
        "user": (
            "Audit the experimental methodology.\n"
            "Cover:\n"
            "- Baseline fairness and completeness\n"
            "- Metric appropriateness for the research question\n"
            "- Evaluation protocol (data leakage, contamination risks)\n"
            "- Ablation completeness\n"
            "- Reproducibility assessment\n"
            "- Specific methodology improvements needed\n\n"
            "{preamble}\n{data_context}\n"
            "Run context:\n{context}"
        ),
    },
}
# -- Sub-prompts (secondary LLM calls within a stage) --------------------
_DEFAULT_SUB_PROMPTS: dict[str, dict[str, Any]] = {
"hypothesis_synthesize": {
"system": (
"You are a senior research director synthesizing multiple perspectives "
"into a decisive research proposal. The best synthesis is not a "
"compromise but takes the strongest elements from each viewpoint. "
"Preserve genuine disagreements — do not flatten controversy."
),
"user": (
"Below are hypotheses generated from three different research perspectives.\n"
"Synthesize them into a final set of 2-4 hypotheses that:\n"
"1. Take the strongest, most novel ideas\n"
"2. Address critical concerns raised by the contrarian\n"
"3. Ensure feasibility (pragmatist's input)\n"
"4. Note unresolved disagreements between perspectives\n"
"5. For each final hypothesis: rationale, measurable prediction, "
"failure condition\n\n"
"{perspectives}"
),
},
"analysis_synthesize": {
"system": (
"You are a senior research director synthesizing multiple analytical "
"perspectives into a comprehensive assessment. Find the truth — if "
"the skeptic or methodologist raise valid concerns, acknowledge them. "
"Do not suppress criticism."
),
"user": (
"Below are analyses from three different perspectives (optimist, "
"skeptic, methodologist).\n"
"Produce a unified analysis that:\n"
"1. Identifies consensus points (high-confidence conclusions)\n"
"2. Resolves conflicts with evidence-based judgment\n"
"3. Rates result quality (1-10 with justification)\n"
"4. Lists 3-5 key findings\n"
"5. Notes methodology gaps that need addressing\n"
"6. Gives a clear PROCEED/PIVOT/REFINE recommendation\n\n"
"Required sections: Metrics Summary, Consensus Findings, "
"Contested Points, Statistical Checks, Methodology Audit, "
"Limitations, Conclusion.\n\n"
"{perspectives}"
),
"max_tokens": 8192,
},
"code_repair": {
"system": "You fix Python code validation errors while preserving functionality.",
"user": (
"The file `{fname}` in the experiment project has validation errors. "
"Fix ALL issues and return ONLY the corrected file.\n\n"
"## Validation Issues in {fname}\n{issues_text}\n\n"
"## All Project Files\n{all_files_ctx}\n\n"
"IMPORTANT: Do NOT use subprocess, os.system, eval, exec, or any "
"network/shell calls.\n"
"NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, "
"np.bool/int/float→Python builtins.\n"
"Return ONLY the corrected code for `{fname}`."
),
},
"iterative_improve": {
"system": (
"You improve experiment projects and return valid executable Python code. "
"Use ```filename:xxx.py format for each file."
),
"user": (
"Improve the experiment code based on prior run results.\n"
"Return the improved files using ```filename:xxx.py format for each file.\n"
"Primary metric key: {metric_key}\n"
"Metric direction: {metric_direction}\n"
"Do not use subprocess, os.system, eval, exec, or any network/shell calls.\n"
"NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, "
"np.bool/int/float→Python builtins, np.math→math.\n\n"
"EXPERIMENT PLAN ANCHOR (CRITICAL — read before making changes):\n"
"The research topic is: {topic}\n"
"{exp_plan_anchor}"
"RULES FOR REFINEMENT:\n"
"- NEVER rename, remove, or replace existing condition names. "
"The condition names in the code MUST match the experiment plan.\n"
"- NEVER add new conditions that are not in the experiment plan.\n"
"- ONLY improve the IMPLEMENTATION of existing conditions "
"(fix bugs, tune hyperparameters, improve training loops).\n"
"- If the code has fundamental issues (wrong algorithm, missing "
"components), fix the implementation but keep the same condition "
"names and class hierarchy.\n\n"
"{condition_coverage_hint}"
"SEED ENFORCEMENT (MANDATORY — BUG-183):\n"
"- You MUST use exactly seeds = [0, 1, 2] (3 seeds minimum).\n"
"- Each condition MUST loop over ALL seeds.\n"
"- Print per-seed: condition=X seed=S {metric_key}: V\n"
"- Print aggregated: condition=X {metric_key}_mean: M {metric_key}_std: S\n"
"- If 3 seeds × all conditions exceeds the time budget, REDUCE training "
"epochs or conditions — NEVER reduce seed count below 3.\n\n"
"CONDITION COUNT LIMIT (HARD RULE):\n"
"- MAXIMUM 8 total conditions (baselines + methods + ablations).\n"
"- If the previous code had >8 conditions, consolidate ablations to 2-3 values.\n\n"
"DOCKER MOUNT TOPOLOGY (for fixing PermissionError/path issues):\n"
"- WRITABLE: /workspace/ (project files), /tmp/, /workspace/data/\n"
"- READ-ONLY: /opt/datasets/ (pre-cached CIFAR-10/100, MNIST, etc)\n"
"- If you see PermissionError on /opt/datasets, do NOT call "
"os.makedirs() there. Use root='/opt/datasets' with download=False.\n"
"- For new data downloads, use /workspace/data/ as root.\n\n"
"Current project files:\n{files_context}\n"
"Run summaries (JSON):\n{run_summaries}"
),
"max_tokens": 8192,
},
"iterative_repair": {
"system": "You fix Python validation issues without adding unsafe behavior.",
"user": (
"Fix all validation issues in main.py and return corrected Python code only.\n\n"
"## Validation Issues\n{issue_text}\n\n"
"## Common RL Stability Fixes (apply if NaN/divergence detected):\n"
"- Add gradient clipping: `torch.nn.utils.clip_grad_norm_(params, 1.0)`\n"
"- Lower learning rate to 1e-4 or 3e-4\n"
"- Add reward normalization/clipping: `reward = np.clip(reward, -10, 10)`\n"
"- Add NaN guard: `if torch.isnan(loss): continue`\n"
"- Use float32 (not float16) for RL value functions\n"
"- NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, "
"np.bool/int/float→Python builtins\n\n"
"## All Project Files\n{all_files_ctx}"
),
},
# ── Advanced Code Agent sub-prompts ──────────────────────────────────
"architecture_planning": {
"system": (
"You are a senior software architect who designs implementation "
"blueprints for scientific experiment codebases. You produce detailed, "
"directly-implementable specifications with pseudocode for every "
"class method and explicit tensor shape annotations. You emphasize "
"separation of concerns: data loading, model definition, training "
"loop, and evaluation are distinct components. You understand ML "
"training deeply and design for correctness: proper .detach(), "
"consistent tensor shapes, and correct gradient flow.\n\n"
"NUMPY 2.x COMPATIBILITY (CRITICAL):\n"
"- np.trapz is REMOVED → use np.trapezoid\n"
"- np.erfinv does NOT exist → use scipy.special.erfinv\n"
"- np.bool, np.int, np.float, np.complex are REMOVED → use Python builtins\n"
"- np.str, np.object are REMOVED → use str, object\n"
"- np.math is REMOVED → use math module"
),
"user": (
"Create a detailed IMPLEMENTATION BLUEPRINT for an experiment codebase.\n\n"
"## Research Context\n"
"TOPIC: {topic}\n"
"PRIMARY METRIC: {metric}\n\n"
"## Experiment Plan\n{exp_plan}\n\n"
"## Requirements\n"
"1. `main.py` MUST be the entry point — runs ALL conditions sequentially.\n"
"2. Each condition MUST be a SEPARATE class with DISTINCT implementation.\n"
"3. Data loading and model definitions in separate modules.\n"
"4. No more than 5 Python files total.\n"
"5. Every class must have at least 20 lines of effective code.\n"
"6. Child classes MUST override at least one core method with DIFFERENT logic.\n"
"7. NEVER override nn.Module.train/eval with different signatures.\n"
"8. Design child classes as STRATEGY variants, not PARAMETER variants.\n\n"
"## Blueprint Format (YAML)\n"
"The blueprint MUST include ALL of the following for EACH file:\n"
"- `generation_order`: integer (1=first to generate, higher=later)\n"
"- `dependencies`: list of other files this file imports from\n"
"- `classes` or `functions`: with pseudocode for each method\n"
"- For neural network classes: input/output tensor shapes\n\n"
"```yaml\n"
"files:\n"
" - name: config.py\n"
" generation_order: 1\n"
" dependencies: []\n"
" purpose: Hyperparameter configuration\n"
" classes:\n"
" - name: Config\n"
" fields:\n"
" - lr: 0.01\n"
" - batch_size: 128\n"
" - epochs: 20\n"
" - hidden_dim: 128\n\n"
" - name: data.py\n"
" generation_order: 2\n"
" dependencies: [config.py]\n"
" purpose: Dataset loading and preprocessing\n"
" functions:\n"
" - name: get_dataloaders\n"
" signature: (config) -> (train_loader, val_loader, test_loader)\n"
" pseudocode: |\n"
" 1. Load dataset from torchvision/disk\n"
" 2. Apply standard transforms (normalize, augment)\n"
" 3. Split train into train/val (90/10)\n"
" 4. Return DataLoaders with config.batch_size\n\n"
" - name: models.py\n"
" generation_order: 3\n"
" dependencies: [config.py]\n"
" purpose: All model implementations\n"
" classes:\n"
" - name: BaseModel(nn.Module)\n"
" input_shape: [B, 3, 32, 32]\n"
" output_shape: [B, 10]\n"
" methods:\n"
" - name: __init__\n"
" pseudocode: Define layers (conv/linear/attention)\n"
" - name: forward\n"
" pseudocode: |\n"
" 1. x = self.encoder(x) # [B,3,32,32] -> [B, hidden]\n"
" 2. logits = self.classifier(x) # [B, hidden] -> [B, 10]\n"
" 3. return logits\n"
" - name: ProposedMethod(BaseModel)\n"
" differentiator: Uses novel component X\n"
" overrides: [forward]\n"
" methods:\n"
" - name: forward\n"
" pseudocode: |\n"
" 1. x = self.encoder(x)\n"
" 2. x = self.novel_component(x) # KEY DIFFERENCE\n"
" 3. logits = self.classifier(x)\n"
" 4. return logits\n"
" - name: compute_special_loss\n"
" pseudocode: |\n"
" 1. Compute task loss: CE(logits, labels)\n"
" 2. Compute novel regularizer\n"
" 3. return task_loss + lambda * reg\n\n"
" - name: training.py\n"
" generation_order: 4\n"
" dependencies: [config.py, data.py, models.py]\n"
" purpose: Training loop and evaluation\n"
" functions:\n"
" - name: train_one_epoch\n"
" signature: (model, loader, optimizer, device) -> float\n"
" pseudocode: |\n"
" 1. model.train()\n"
" 2. For each batch: forward, loss, backward, step\n"
" 3. Return average loss\n"
" - name: evaluate\n"
" signature: (model, loader, device) -> dict\n"
" pseudocode: |\n"
" 1. model.eval() with torch.no_grad()\n"
" 2. For each batch: forward, argmax predictions\n"
" 3. Return {accuracy, loss}\n\n"
" - name: main.py\n"
" generation_order: 5\n"
" dependencies: [config.py, data.py, models.py, training.py]\n"
" purpose: Entry point — runs ALL conditions\n"
" contract:\n"
" prints_metric_def: true\n"
" prints_registered_conditions: true\n"
" runs_all_conditions: true\n"
" per_seed_reporting: true\n"
" time_budget_guard: true\n"
" functions:\n"
" - name: main\n"
" pseudocode: |\n"
" 1. Print METRIC_DEF line\n"
" 2. Print REGISTERED_CONDITIONS\n"
" 3. Setup time budget guard\n"
" 4. For each condition:\n"
" a. Create model instance\n"
" b. For each seed:\n"
" - Set random seed\n"
" - Train model\n"
" - Evaluate and print per-seed metrics\n"
" c. Print mean/std across seeds\n"
" 5. Print SUMMARY comparison\n\n"
"verification_criteria:\n"
" - All condition classes have DIFFERENT forward/step implementations\n"
" - Input/output tensor shapes are consistent across data->model->loss\n"
" - Time budget guard exists in main training loop\n"
" - Per-seed random state isolation\n"
" - All .detach() calls present for values used across iterations\n\n"
"conditions:\n"
" - name: ConditionName\n"
" class: ClassName\n"
" description: What makes it different\n"
"```\n\n"
"Output ONLY the YAML specification wrapped in ```yaml``` fences.\n"
"Be SPECIFIC in pseudocode — include tensor shapes, loss formulas, "
"and algorithmic details from the experiment plan.\n"
"Every class must have detailed pseudocode showing HOW it differs "
"from others, not just THAT it differs."
),
"max_tokens": 8192,
},
"generate_single_file": {
"system": (
"You are an expert ML engineer who writes production-quality Python code "
"for scientific experiments. You follow implementation blueprints precisely, "
"ensuring tensor shapes match, gradients flow correctly, and all imports "
"resolve. You write complete, runnable code — never stubs or placeholders."
),
"user": (
"Generate the Python file `{file_name}` for an ML experiment project.\n\n"
"## File Specification\n{file_spec}\n\n"
"## Full Project Blueprint\n{blueprint}\n\n"
"## Already Generated Files (summaries)\n{dependency_summaries}\n\n"
"## Already Generated Files (full code of direct dependencies)\n"
"{dependency_code}\n\n"
"## Research Topic\n{topic}\n\n"
"## Experiment Plan\n{exp_plan}\n\n"
"## Environment\n{pkg_hint}\n\n"
"## CRITICAL Rules\n"
"1. Follow the blueprint specification EXACTLY — implement every class "
"and function listed for this file.\n"
"2. Tensor shapes MUST match the blueprint annotations.\n"
"3. Imports from dependency files MUST use the exact class/function names "
"from the already-generated code.\n"
"4. Every method must have a REAL implementation — no `pass`, no `...`, "
"no `raise NotImplementedError`.\n"
"5. NEVER use random numbers as fake metrics.\n"
"6. For RL code: .detach() ALL values from previous iterations before "
"using in current loss.\n"
"7. For neural networks: create layers in __init__, not in forward().\n"
"8. METHOD RICHNESS: Every non-trivial method should be >=5 lines of "
"real logic. If a method only calls super() or returns a constant, "
"add the actual computation it should perform. Training methods should "
"include proper gradient handling, metric logging, and error checks.\n"
"9. ABLATION DIFFERENTIATION: If this file contains ablation/variant "
"classes, each MUST differ in actual algorithm logic — not just in "
"parameter values or by removing a line. Ablations should clearly "
"implement a different computational path.\n"
"10. NO CLI CONDITION ARGS: If this is main.py, NEVER add argparse "
"arguments like --condition or --method. All conditions must be "
"iterated inside main.py with a for-loop. The harness runs "
"`python main.py` with no arguments.\n"
"11. NUMPY 2.x COMPATIBILITY: np.trapz→np.trapezoid, "
"np.erfinv→scipy.special.erfinv, np.bool/np.int/np.float→Python builtins, "
"np.str/np.object→str/object, np.math→math.\n\n"
"Output ONLY the Python code for `{file_name}` — no markdown fences, "
"no explanations, just the code."
),
"max_tokens": 8192,
},
"code_exec_fix": {
"system": (
"You are a debugging expert who fixes runtime errors in Python "
"experiment code. You preserve the original experiment design and "
"scientific methodology while fixing the specific error. You fix "
"the ROOT CAUSE, not just the symptom."
),
"user": (
"The following experiment code crashed during execution.\n\n"
"## Error Output (stderr, last 3000 chars)\n"
"```\n{stderr}\n```\n\n"
"## Standard Output (last 50 lines)\n"
"```\n{stdout_tail}\n```\n\n"
"## Return Code: {returncode}\n\n"
"## Current Code Files\n{files_context}\n\n"
"## Instructions\n"
"1. Identify the ROOT CAUSE of the error.\n"
"2. Fix it while preserving the experiment design.\n"
"3. Check for similar potential issues in ALL files.\n"
"4. Do NOT simplify or remove experiment logic — fix the bug.\n"
"5. Do NOT add subprocess, os.system, eval, exec, or network calls.\n"
"6. COMMON BUG: If error is about `train()` missing arguments, it means "
"a class overrode nn.Module.train() with a custom signature. Fix by "
"renaming the custom method to `fit()` or `run_training()` and updating "
"all callers. Never override nn.Module.train/eval with extra args.\n"
"7. NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, "
"np.bool/int/float/complex→Python builtins, np.str/object→str/object.\n\n"
"Output ALL files in ```filename:xxx.py``` format, including files "
"that don't need changes."
),
"max_tokens": 16384,
},
"code_reviewer": {
"system": (
"You are a meticulous experiment code reviewer focused on "
"scientific correctness, statistical rigor, and code quality. "
"You catch bugs that static analysis cannot: incorrect algorithm "
"implementations, missing controls, wrong metric computation, "
"and experimental design flaws."
),
"user": (
"Review this experiment code for correctness and quality.\n\n"
"## Research Context\n"
"TOPIC: {topic}\n"
"PRIMARY METRIC: {metric}\n\n"
"## Experiment Plan\n{exp_plan}\n\n"
"## Code Files\n{files_context}\n\n"
"## Review Criteria\n"
"1. **CORRECTNESS**: Does the code correctly implement the "
"experiment plan? Are algorithms implemented properly?\n"
"2. **COMPLETENESS**: Are all conditions/ablations implemented "
"with DISTINCT logic? (Not just renamed copies of baseline.)\n"
"3. **STATISTICAL RIGOR**: Multiple seeds? Results averaged and "
"reported with std? Paired comparisons?\n"
"4. **METRIC REPORTING**: Is {metric} correctly computed and "
"printed in the required format?\n"
"5. **ROBUSTNESS**: Shape mismatches? Missing imports? Type "
"errors? Division by zero? GPU/CPU device conflicts?\n"
"6. **CLASS DEPTH**: Each experimental condition class must have "
"at least 20 lines of effective code with distinct logic. Classes "
"that only override __init__ to change parameters are CRITICAL "
"issues — they indicate the condition is not truly different.\n\n"
"## Output Format (JSON)\n"
"```json\n"
'{{\n'
' "verdict": "APPROVE or REVISE",\n'
' "score": 1-10,\n'
' "critical_issues": ["issue1", "issue2"],\n'
' "suggestions": ["suggestion1", "suggestion2"]\n'
'}}\n'
"```\n\n"
"Only use verdict REVISE if there are critical issues that would "
"cause the code to crash or produce scientifically invalid results."
),
"json_mode": True,
"max_tokens": 4096,
},
}
# -- Stage prompts (one entry per LLM-calling stage) ---------------------
_DEFAULT_STAGES: dict[str, dict[str, Any]] = {
# ── Phase A: Research Scoping ────────────────────────────────────────
"topic_init": {
"system": (
"You are a rigorous research planner who identifies NOVEL, TIMELY "
"research angles. You follow recent trends from top venues in the "
"relevant domain and propose research that advances "
"the frontier rather than repeating known results.\n\n"
"NOVELTY PRINCIPLES:\n"
"- A good research angle addresses a GAP not yet covered by existing work.\n"
"- Avoid pure benchmark/comparison studies unless the methodology is novel.\n"
"- Prefer angles that combine existing techniques in new ways, apply methods "
"to underexplored domains, or challenge common assumptions.\n"
"- The research must be FEASIBLE with limited compute (single GPU, hours not days).\n"
"- Check: would a reviewer say 'this is already well-known'? If so, find a sharper angle."
),
"user": (
"Create a SMART research goal in markdown.\n"
"Topic: {topic}\n"
"Domains: {domains}\n"
"Project: {project_name}\n"
"Quality threshold: {quality_threshold}\n\n"
"Required sections:\n"
"- **Topic**: The broad area\n"
"- **Novel Angle**: What specific aspect has NOT been well-studied? "
"Why is this timely NOW (2024-2026)? What recent development creates "
"an opportunity? How does this differ from standard approaches?\n"
"- **Scope**: Focused enough for a single paper\n"
"- **SMART Goal**: Specific, Measurable, Achievable, Relevant, Time-bound\n"
"- **Constraints**: Compute budget, available tools, data access\n"
"- **Success Criteria**: What results would make this publishable?\n"
"- **Generated**: Timestamp\n\n"
"IMPORTANT: The 'Novel Angle' section must convincingly argue why this "
"specific research direction is NOT already covered by existing work. "
"If the topic is well-studied (e.g., 'comparing optimizers'), you MUST "
"find a specific unexplored aspect (e.g., 'under distribution shift with "
"noisy gradients', 'in the few-shot regime', 'with modern architectures').\n\n"
"TREND VALIDATION (MANDATORY):\n"
"- Identify 2-3 recent papers (2024-2026) that establish the relevance "
"of this research direction.\n"
"- Name the specific benchmark/dataset that will be used for evaluation.\n"
"- If no standard benchmark exists, explain how results will be measured.\n"
"- State whether SOTA results exist on this benchmark and what they are.\n"
"- Add a 'Benchmark' subsection listing: name, source, metrics, "
"current SOTA (if known)."
),
},
"problem_decompose": {
"system": "You are a senior research strategist.",
"user": (
"Decompose this research problem into at least 4 prioritized "
"sub-questions.\n"
"Topic: {topic}\n"
"Output markdown with sections: Source, Sub-questions, Priority "
"Ranking, Risks.\n"
"Goal context:\n{goal_text}"
),
},
# ── Phase B: Literature Discovery ────────────────────────────────────
"search_strategy": {
"system": (
"You design literature retrieval strategies and source verification plans."
),
"user": (
"Create a merged search strategy package.\n"
"Return a JSON object with keys: search_plan_yaml, sources.\n"
"search_plan_yaml must be valid YAML text.\n"
"sources must include id,name,type,url,status,query,verified_at.\n"
"Topic: {topic}\n"
"Problem tree:\n{problem_tree}"
),
"json_mode": True,
},
"literature_collect": {
"system": "You are a literature mining assistant.",
"user": (
"Generate candidate papers from the search plan.\n"
"Return JSON: {candidates:[...]} with >=8 rows.\n"
"Each candidate must include id,title,source,url,year,abstract,"
"collected_at.\n"
"Topic: {topic}\n"
"Search plan:\n{plan_text}"
),
"json_mode": True,
},
"literature_screen": {
"system": (
"You are a strict domain-aware reviewer with zero tolerance for "
"cross-domain false positives. You MUST reject papers that are "
"from unrelated fields, even if they share superficial keyword "
"overlap. A paper about 'normalization in database systems' is "
"NOT relevant to 'normalization in deep learning'. A paper about "
"'graph theory in social networks' is NOT relevant to 'graph "
"neural networks for molecular property prediction'."
),
"user": (
"Perform merged relevance+quality screening and return shortlist.\n"
"Return JSON: {shortlist:[...]} each with title, cite_key "
"(if present), relevance_score (0-1), quality_score (0-1), "
"keep_reason.\n"
"Preserve all original fields (paper_id, doi, arxiv_id, cite_key, "
"etc.) from the input.\n"
"Topic: {topic}\n"
"Domains: {domains}\n"
"Threshold: {quality_threshold}\n\n"
"SCREENING RULES (apply strictly):\n"
"1. DOMAIN MATCH: The paper's actual research domain must match "
"the topic's domain. Shared keywords across domains do NOT count.\n"
"2. METHOD RELEVANCE: The paper must discuss methods, benchmarks, "
"or findings directly applicable to the research topic.\n"
"3. CROSS-DOMAIN REJECTION: Reject papers from unrelated fields "
"(e.g., wireless communications, database systems, social science) "
"even if they use similar terminology.\n"
"4. RECENCY PREFERENCE: Prefer papers from 2020+ for methodology, "
"but accept foundational papers (pre-2020) if they introduced key "
"techniques still in use today.\n"
"5. SEMINAL PAPERS: Papers marked as source='seminal_library' are "
"pre-vetted foundational references — keep them if their keywords "
"match the topic (relevance_score >= 0.7).\n"
"6. QUALITY FLOOR: Reject papers with no abstract, no venue, and "
"no citation count (likely not real papers).\n"
"Candidates JSONL:\n{candidates_text}"
),
"json_mode": True,
},
"knowledge_extract": {
"system": "You extract high-signal evidence cards from papers.",
"user": (
"Extract structured knowledge cards from shortlist.\n"
"Return JSON: {cards:[{card_id,title,cite_key,problem,method,"
"data,metrics,findings,limitations,citation}]}.\n"
"IMPORTANT: If the input contains cite_key fields, preserve them "
"exactly in the output.\n"
"Shortlist:\n{shortlist}"
),
"json_mode": True,
},
# ── Phase C: Knowledge Synthesis ─────────────────────────────────────
"synthesis": {
"system": "You are a synthesis specialist for literature reviews.",
"user": (
"Produce merged synthesis output (topic clusters + research gaps).\n"
"Output markdown with sections: Cluster Overview, Cluster 1..N, "
"Gap 1..N, Prioritized Opportunities.\n"
"Topic: {topic}\n"
"Cards context:\n{cards_context}"
),
"max_tokens": 8192,
},
"hypothesis_gen": {
"system": (
"You formulate testable scientific hypotheses that address gaps "
"NOT covered by existing literature. Your hypotheses must be:\n"
"1. NOVEL: Not simply replicating known results or testing obvious things.\n"
"2. GAP-FILLING: Address specific weaknesses or blind spots identified "
"in the literature synthesis.\n"
"3. FEASIBLE: Testable with limited compute (single GPU, <1 day runtime).\n"
"4. FALSIFIABLE: Have clear failure conditions that would definitively "
"reject the hypothesis.\n"
"5. SURPRISING: At least one hypothesis should challenge conventional "
"wisdom or test a counter-intuitive prediction."
),
"user": (
"Generate at least 2 falsifiable hypotheses from the synthesis below.\n"
"For each hypothesis provide:\n"
"- **Hypothesis statement**: A clear, testable claim\n"
"- **Novelty argument**: Why this has NOT been tested before, citing "
"specific gaps from the synthesis\n"
"- **Rationale**: Theoretical or empirical basis for expecting this result\n"
"- **Measurable prediction**: Specific quantitative outcome expected\n"
"- **Failure condition**: What result would reject this hypothesis?\n"
"- **Required baselines**: What modern, state-of-the-art methods must be "
"compared against to make the finding meaningful?\n\n"
"AVOID:\n"
"- Hypotheses that are trivially obvious (e.g., 'more data improves accuracy')\n"
"- Hypotheses that replicate well-known results already in the literature\n"
"- Hypotheses that cannot be tested within the compute budget\n\n"
"Synthesis:\n{synthesis}"
),
},
# ── Phase D: Experiment Design ───────────────────────────────────────
"experiment_design": {
"system": "You are a principal investigator designing rigorous research experiments.",
"user": (
"{preamble}\n\n"
"Design an experiment plan as YAML.\n"
"Required keys: objectives,datasets,baselines,proposed_methods,"
"ablations,metrics,risks,compute_budget.\n\n"
"NAMING REQUIREMENT (CRITICAL for paper quality):\n"
"- Every condition name in baselines, proposed_methods, and ablations MUST be "
"a DESCRIPTIVE algorithm name DERIVED FROM THE HYPOTHESES ABOVE, NOT a generic label.\n"
"- WRONG: baseline_1, baseline_2, method_variant_1, method_variant_2\n"
"- WRONG: random_search, bayesian_optimization, ppo_policy, curiosity_driven_rl "
"(these are generic defaults — NEVER use them unless they are actually what "
"the hypotheses call for)\n"
"- RIGHT: names that reflect the specific methods/architectures/algorithms in "
"the hypotheses (e.g., rim_agent, monolithic_gru, ewc_baseline, sleep_consolidation, "
"no_sleep_ablation, coarse_routing, fine_routing)\n"
"- The name should immediately tell a reader WHAT algorithm or strategy is used.\n"
"- This is critical because these names appear directly in the paper.\n\n"
"BASELINE & BENCHMARK MODERNITY (CRITICAL for acceptance):\n"
"- Baselines MUST be modern, widely-adopted methods from recent top-venue "
"papers (2023-2026). Beating only outdated or weak baselines is NOT a valid "
"contribution and will result in desk rejection.\n"
"- Include at LEAST one strong baseline that represents current SOTA or "
"near-SOTA in the specific sub-area. Check recent NeurIPS/ICML/ICLR papers "
"to identify appropriate baselines.\n"
"- Benchmarks MUST be standard and actively used. If a benchmark has been "
"superseded, use the newer version.\n"
"- For each baseline, cite the original paper and note why it is a fair "
"and competitive comparison.\n\n"
"HYPOTHESIS ALIGNMENT (CRITICAL — most common failure mode):\n"
"- Your experiment plan MUST directly test the hypotheses listed above.\n"
"- Each hypothesis should map to at least one comparison between conditions.\n"
"- Baselines must be the specific alternatives named in the hypotheses, NOT "
"generic optimization methods like random_search or bayesian_optimization.\n"
"- If a hypothesis says 'X outperforms Y', then X must be a proposed_method "
"and Y must be a baseline.\n"
"- Ablations must isolate the specific components claimed to matter in the "
"hypotheses (e.g., if hypothesis claims routing helps, ablate routing).\n\n"
"STABILITY & REPRODUCIBILITY (CRITICAL for RL-based methods):\n"
"- Under `proposed_methods`, specify key hyperparameters (learning rate, "
"gradient clip threshold, entropy coefficient, etc.).\n"
"- Under `risks`, explicitly list numerical stability concerns "
"(NaN/divergence, reward explosion, policy collapse) and mitigations "
"(gradient clipping, reward normalization, early stopping on NaN).\n"
"- Under `metrics`, include:\n"
" * Primary metric: `{metric_key}` with direction: `{metric_direction}` "
"and units\n"
" * IMPORTANT: The metric direction MUST be `{metric_direction}` — do "
"NOT use a different direction. If {metric_direction}=='minimize', lower "
"is better. If {metric_direction}=='maximize', higher is better.\n"
" * `success_rate`: fraction of seeds that complete without NaN/crash\n"
" * At least ONE discovery-aligned endpoint (e.g., identification "
"accuracy, time-to-discovery, final posterior mass on true hypothesis) "
"in addition to any proxy metric\n"
"{dataset_guidance}\n\n"
"- Under `datasets`, specify AT LEAST 2 regime factors to stratify by "
"(e.g., noise_level: [low, high], hypothesis_space_size: [small, large]). "
"Results MUST be reported per-regime. A single-regime experiment cannot "
"support generality claims and will be rejected by reviewers.\n"
"- FACTORIAL DESIGN PREFERRED: If you vary multiple factors (e.g., scale AND "
"noise), design a factorial grid (e.g., small+low, small+high, large+low, "
"large+high) so each factor's effect can be isolated. Bundling factors "
"(e.g., easy=small+low, hard=large+high) is a confounder and reviewers will "
"flag it. If computational budget limits the grid, at minimum acknowledge "
"that factors are bundled and limit claims accordingly.\n"
"- Under `compute_budget`, plan for minimum 10 seeds per condition to "
"ensure valid statistical comparisons.\n\n"
"STATISTICAL POWER REQUIREMENTS (CRITICAL for publishability):\n"
"- Use AT LEAST 5 random seeds per condition (10 preferred)\n"
"- Use AT LEAST 30 episodes per seed for RL methods\n"
"- Report: mean ± std, 95% bootstrap CI, per-seed raw values\n"
"- For method comparisons: use paired bootstrap or Wilcoxon signed-rank test "
"(NOT paired t-test with n < 10)\n"
"- Report effect sizes (Cohen's d or rank-biserial correlation)\n"
"- 3 seeds is INSUFFICIENT — reviewers will reject papers with n=3\n\n"
"HARDWARE ENVIRONMENT (your experiments run on THIS exact machine):\n"
"{hardware_profile}\n"
"- You have exactly ONE GPU. No distributed training. No multi-GPU. No multi-node.\n"
"- Design experiments that fit this single GPU.\n\n"
"COMPUTE BUDGET CONSTRAINT (CRITICAL — experiments MUST fit time budget):\n"
"- Total experiment time budget: {time_budget_sec} seconds.\n"
"- Per-condition budget: ~{per_condition_budget_sec} seconds "
"(= time_budget × 0.7 / 6 conditions).\n"
"- Pre-cached datasets (instant, no download): {available_tier1_datasets}\n"
"- DO NOT plan experiments requiring multiple GPUs or more than "
"{time_budget_sec}s.\n"
"- HARD CONDITION LIMIT: The total number of conditions (baselines + "
"proposed_methods + ablations) MUST NOT exceed 8 for budgets ≤ 3600s.\n"
" * Recommended: 2-3 baselines + 1-2 proposed methods + 2-3 ablations = 5-8 total.\n"
" * Generating 10+ conditions guarantees most will time out and data will be wasted.\n"
" * Quality over quantity: 6 well-run conditions with 5 seeds each >> 20 conditions "
"with 1 seed each.\n"
"- Each run needs AT LEAST 60 seconds for RL (environment setup + "
"training + evaluation). For deep learning with GPU, at least 120 seconds.\n"
"- HARD CAP: total_conditions × num_seeds × seconds_per_run MUST be < "
"{time_budget_sec} × 0.8 (leave 20% margin for overhead).\n"
"- If total would exceed the budget, you MUST reduce by:\n"
" 1. First: reduce conditions (merge similar ablations, keep strongest baselines)\n"
" 2. Then: reduce seeds to 5 (minimum for statistical validity)\n"
" 3. Then: reduce regimes/environments to 1\n"
"- Example: {time_budget_sec}s budget with 120s/condition/seed, 5 seeds → "
"max {time_budget_sec} / (120 * 5) ≈ 4 conditions.\n\n"
"IMPLEMENTATION SPECIFICATION (CRITICAL for code generation):\n"
"For each proposed method AND each baseline, you MUST include an "
"'implementation_spec' key with:\n"
" - class_name: the Python class name for this method\n"
" - key_methods: list of methods the class must implement "
"(e.g., [__init__, forward, train_step, predict])\n"
" - algorithm_steps: pseudocode-level description of the core algorithm "
"(3-10 steps), e.g.:\n"
" 1. Encode input via encoder network (MLP: input_dim -> hidden_dim)\n"
" 2. Compute attention weights over memory buffer\n"
" 3. Aggregate attended features with learned gate\n"
" 4. Decode to output via decoder network\n"
" - loss_function: the mathematical formula for the training loss "
"(e.g., 'L = CE(y_pred, y_true) + lambda * KL(q||p)')\n"
" - key_hyperparameters: dict of hyperparameter name -> default value\n"
" - differentiator: what makes THIS method different from others "
"(must be an algorithmic difference, not just a hyperparameter change)\n\n"
"For each ablation, you MUST specify:\n"
" - what_is_removed: the specific component being ablated\n"
" - how_it_differs: concrete code-level description of the change "
"(e.g., 'replace attention layer with mean pooling', 'set routing "
"weight to uniform 1/N', 'remove skip connection in block 3')\n"
" - expected_effect: why removing this should change results\n\n"
"This specification is MANDATORY — without it, the code generation "
"stage cannot produce correct implementations.\n\n"
"Hypotheses:\n{hypotheses}"
),
},
"code_generation": {
"system": (
"You are a computational scientist who writes real, runnable "
"experiments. Your code implements actual algorithms with real "
"mathematical operations. You NEVER fake results with random number "
"generators. Always use the ```filename:xxx.py format for each file. "
"Use numpy for numerical computation. Keep code self-contained "
"and deterministic."
),
"user": (
"Generate a Python experiment project for the following research "
"topic:\n"
"TOPIC: {topic}\n\n"
"CRITICAL REQUIREMENTS — your code MUST satisfy ALL of these:\n"
"1. Implement the ACTUAL experiment described in the topic and "
"plan below.\n"
" If the topic is about simulation (e.g., multi-agent systems, "
"network dynamics),\n"
" write simulation code. If about optimization, write "
"optimization code.\n"
" Match the code to the topic — do NOT default to generic "
"gradient descent.\n"
"2. Use proper mathematical models appropriate to the research "
"question.\n"
" Examples: agent-based simulation, graph algorithms, "
"statistical analysis,\n"
" optimization, Monte Carlo methods — whatever fits the topic.\n"
"3. Run REAL computational experiments with meaningful "
"parameters.\n"
"4. Collect REAL metrics that directly answer the research "
"question.\n"
"5. The code must be scientifically meaningful — a reviewer should "
"see\n"
" actual implementations relevant to the TOPIC, not a generic "
"optimizer.\n\n"
"OUTPUT FORMAT — return multiple files using this exact format:\n"
"```filename:main.py\n"
"# entry point code\n"
"```\n\n"
"```filename:models.py\n"
"# model/algorithm implementations\n"
"```\n\n"
"Only create additional files (optimizers.py, data_utils.py, etc.) "
"if they contain substantial logic (>20 lines). Do NOT create stub "
"files with only imports or pass statements.\n\n"
"CODE STRUCTURE:\n"
"- main.py: entry point that runs experiments and prints metrics\n"
"- main.py MUST begin with a docstring specifying:\n"
" (a) Dataset used and how it is loaded\n"
" (b) Distribution shift / corruption definition (if applicable)\n"
" (c) Model architecture (layers, dimensions, activation)\n"
" (d) Training protocol (optimizer, epochs, batch size, LR schedule)\n"
" (e) Evaluation protocol (train/test split, metrics computed)\n"
"- Additional modules for algorithms, objective functions, "
"utilities\n"
"- Primary metric key: {metric}\n"
"- main.py must print metric lines as `name: value` (one per "
"line)\n"
"- Use deterministic seeds (numpy.random.seed or random.seed)\n"
"- No external data files, no network calls, no GPU required\n"
"- FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket\n"
"{pkg_hint}\n"
"ANTI-PATTERNS (do NOT do these):\n"
"- Do NOT generate random numbers and pretend they are experiment "
"results\n"
"- Do NOT use `random.uniform()` to simulate a decreasing loss "
"curve\n"
"- Do NOT hardcode metric values or use trivial arithmetic as "
"metrics\n\n"
"MULTI-CONDITION REQUIREMENT (CRITICAL):\n"
"The experiment plan below specifies multiple conditions, treatments, "
"or strategies to compare. Your code MUST:\n"
"1. Implement ALL conditions/treatments listed in the experiment plan "
"— not just one baseline.\n"
"2. Run each condition independently with the same controlled setup "
"(same seeds, same initialization, same budget).\n"
" IMPORTANT: All conditions MUST be iterated INSIDE main.py using a "
"for-loop or dispatch table. NEVER use argparse --condition or any CLI "
"argument to select a condition. The harness calls `python main.py` "
"with NO arguments — if you add a required --condition arg it will crash.\n"
"3. Print metrics with condition labels: "
"`condition= {metric}: ` for EACH condition.\n"
"4. After all conditions, print a summary comparison line: "
"`SUMMARY: condition1=, condition2=, ...`\n"
"5. If the plan has N conditions, the output MUST contain N separate "
"labeled metric streams. Running only one condition is NOT acceptable.\n"
"6. BREADTH-FIRST ORDERING: Run ONE representative configuration per "
"condition FIRST (e.g., default parameters), so that ALL conditions "
"produce at least one result. Only AFTER all conditions have results, "
"run additional parameter sweeps if time remains. This prevents the "
"time budget from being exhausted on condition 1's parameter sweep "
"while conditions 2..N never execute.\n"
"7. CONDITION COMPLETENESS: After code generation, mentally verify that "
"EVERY condition in the experiment plan below has a corresponding code "
"path. If the plan lists conditions A, B, C, D — your code must handle "
"all four, not just A, B, C. Missing conditions invalidate the experiment.\n"
"8. CRASH RESILIENCE: Wrap each condition's execution in a try/except "
"block so that if one condition crashes (e.g., NaN, timeout, config error), "
"the remaining conditions still execute. Print `CONDITION_FAILED: "
"` on failure and continue to the next condition. A partial result "
"set is far more valuable than a complete crash.\n"
"9. CONDITION REGISTRY VALIDATION: At startup (before running experiments), "
"enumerate all condition names and verify each has a valid code path. Print "
"`REGISTERED_CONDITIONS: , , ...` at the top of output. If "
"any condition is unrecognized, print `MISSING_CONDITION: ` and skip "
"it gracefully rather than raising an exception.\n"
"10. TOTAL CONDITIONS LIMIT (HARD RULE): Your code MUST NOT register more "
"than 8 total conditions. If the experiment plan lists ablations with many "
"parameter values (e.g., 'test decay rates 0.9, 0.99, 0.995, 0.999, 0.9999'), "
"pick the 2-3 most informative values — do NOT create a separate condition for "
"each value. 8 conditions × 3 seeds × budget ÷ conditions = tight timing. "
"Quality of each condition matters more than quantity.\n\n"
"METRIC DEFINITION REQUIREMENT (CRITICAL):\n"
"- At the top of main.py, include a docstring or comment block that defines:\n"
" * METRIC NAME: the exact key printed as `{metric}: `\n"
" * DIRECTION: {metric_direction_hint}\n"
" * UNITS/SCALE: what the number represents (e.g., MSE in log scale, "
"accuracy 0-1, discovery rate per episode)\n"
" * FORMULA: how the metric is computed from raw experiment outputs\n"
" * AGGREGATION: how per-step/per-episode values are reduced to a scalar\n"
"- Print this definition at runtime: `METRIC_DEF: {metric} | direction= "
"| desc=`\n"
"- Without this definition, the metric is UNINTERPRETABLE and the paper cannot "
"make any claims about which method is better.\n\n"
"STATISTICAL RIGOR REQUIREMENT:\n"
"- Run each condition with at least 5 different random seeds (10+ preferred "
"if time budget allows). Minimum 3 seeds is MANDATORY.\n"
"- Print per-seed results: `condition= seed= {metric}: `\n"
"- Print mean and std across seeds: "
"`condition= {metric}_mean: {metric}_std: `\n"
"- If time budget is tight, reduce per-seed iterations rather than "
"reducing seed count. Minimum 3 seeds is non-negotiable.\n"
"- SEED COUNT IS FIXED AT 3 MINIMUM. Do NOT compute seed count dynamically.\n"
" Hardcode `SEEDS = [0, 1, 2]`. If 3 seeds × all conditions exceeds the time "
"budget, REDUCE the number of conditions or training epochs — NEVER reduce seeds.\n"
" Print: `SEED_COUNT: 3 (fixed minimum, budget={time_budget}s, conditions=N)`.\n"
"- Report bootstrap 95% confidence intervals when n >= 5.\n\n"
"FAILURE-AWARE REPORTING (CRITICAL for RL/unstable methods):\n"
"- Track how many seeds succeed vs fail (NaN, divergence, crash) per "
"condition. Print: `condition= success_rate: /`\n"
"- Compute UNCONDITIONAL metrics: treat failed seeds as worst-case "
"(e.g., metric=0 or metric=worst_baseline). Print: "
"`condition= unconditional_{metric}_mean: `\n"
"- This prevents survivorship bias where a method looks good only "
"because failed runs are excluded.\n"
"- For RL methods, add STABILITY SAFEGUARDS in the code:\n"
" * Gradient clipping (max norm 1.0)\n"
" * Reward normalization/clipping to [-10, 10]\n"
" * NaN checks on loss/gradients with graceful early stop (not crash)\n"
" * Learning rate warmup or conservative initial learning rate\n"
" These safeguards should PREVENT most NaN/divergence, not just catch "
"them after the fact.\n\n"
"PYTORCH RL IMPLEMENTATION BUGS (CRITICAL — these cause 100% crash rate):\n"
"- 'Trying to backward through the graph a second time' is the #1 crash.\n"
" CAUSE: reusing a computed tensor across multiple backward() calls.\n"
" FIX: Always .detach() values used in the next iteration:\n"
" ```\n"
" # WRONG:\n"
" old_log_prob = policy.log_prob(action) # still attached to graph\n"
" # ... later in update loop:\n"
" ratio = new_log_prob / old_log_prob # backward crashes\n"
" \n"
" # CORRECT:\n"
" old_log_prob = policy.log_prob(action).detach() # detach!\n"
" # ... later in update loop:\n"
" ratio = new_log_prob / old_log_prob.detach() # safe\n"
" ```\n"
"- For PPO: old_log_probs MUST be .detach()ed when stored for later ratio computation.\n"
"- For value functions: target values MUST be .detach()ed (don't backprop through targets).\n"
"- For curiosity/intrinsic reward: prediction errors used as reward MUST be .detach()ed.\n"
"- General rule: any tensor from a PREVIOUS forward pass that is used in the CURRENT "
"loss computation MUST be .detach()ed.\n"
"- When in doubt, add .detach() — it never causes crashes, but missing it always does.\n\n"
"NEURAL NETWORK DIMENSION CONSISTENCY (CRITICAL — #2 crash cause):\n"
"- 'input and weight.T shapes cannot be multiplied' means obs_dim != network input_dim.\n"
"- When the environment observation size VARIES across regimes (e.g., easy=6, hard=8), "
"the neural network's input layer MUST match EACH regime's obs_dim.\n"
"- FIX: Create the network INSIDE the per-regime loop, or parameterize input_dim:\n"
" ```\n"
" # WRONG: fixed input_dim for all regimes\n"
" policy = PolicyNet(input_dim=10) # breaks if obs_dim != 10\n"
" for regime in regimes:\n"
" obs = env.reset() # obs.shape may vary!\n"
" \n"
" # CORRECT: dynamic input_dim per regime\n"
" for regime in regimes:\n"
" obs = env.reset()\n"
" obs_dim = obs.shape[-1] # or len(obs)\n"
" policy = PolicyNet(input_dim=obs_dim) # fresh network per regime\n"
" ```\n"
"- ALWAYS initialize neural networks AFTER knowing the observation dimension.\n\n"
"KNOWLEDGE DISTILLATION (KD) STABILITY (if applicable):\n"
"- Teacher network MUST be frozen: `teacher.eval()` and "
"`for p in teacher.parameters(): p.requires_grad = False`\n"
"- Temperature parameter T: typical range 1-20. Use T=4 as default. "
"NEVER use T<1 (causes sharp distributions → NaN gradients).\n"
"- Loss balance: `loss = alpha * kd_loss + (1-alpha) * task_loss` — "
"set alpha=0.5-0.9. If kd_loss scale >> task_loss, val_loss becomes NaN.\n"
"- PROJECTION LAYERS: If teacher and student have different intermediate "
"dimensions (e.g., teacher_dim=768, student_dim=256), you MUST add "
"`nn.Linear(student_dim, teacher_dim)` to align features before computing "
"distillation loss. Without projection layers, tensor shape mismatch WILL crash.\n"
"- Common KD NaN causes: (1) no temperature scaling on logits, "
"(2) missing gradient clipping, (3) learning rate too high (use ≤1e-3), "
"(4) teacher not frozen → unstable targets.\n\n"
"PAIRED STATISTICAL ANALYSIS (CRITICAL for publishable results):\n"
"- Use the SAME random seeds across all conditions so results are paired.\n"
"- After collecting per-seed results for all conditions, compute paired "
"differences: for each seed s, diff(s) = method(s) - baseline(s).\n"
"- Print paired analysis: "
"`PAIRED: vs mean_diff= std_diff= "
"t_stat= p_value=`\n"
"- Also print bootstrap 95% CI of the paired difference.\n"
"- This is FAR more powerful than independent comparisons because it "
"controls for seed-to-seed variance.\n\n"
"MULTI-REGIME REQUIREMENT (CRITICAL for generality claims):\n"
"- The experiment MUST test at least 2 different difficulty/noise regimes "
"(e.g., low noise vs high noise, small hypothesis space vs large).\n"
"- Report results per-regime, not just aggregated across regimes.\n"
"- Print regime labels: "
"`condition= regime= {metric}: `\n"
"- This prevents conclusions that only hold in one setting from being "
"presented as general findings.\n\n"
"DIMENSION CONSISTENCY CHECK (CRITICAL for RL/neural methods):\n"
"- Before passing observations/states to neural networks or policy "
"parameters, VERIFY that dimensions match. Common bug: environment "
"state has dimension D1 but network expects D2.\n"
"- At the start of each condition, print the state/observation "
"dimension and the network input dimension. If they mismatch, "
"reshape or adjust the network before proceeding.\n"
"- Test EVERY condition with a single dry-run step before the full "
"loop to catch shape mismatches early.\n\n"
"TIME-TO-EVENT METRIC BUG PREVENTION (CRITICAL — common silent bug):\n"
"- If the primary metric is a 'time-to-X' measure (e.g., time-to-discovery, "
"steps-to-convergence, episodes-to-threshold), you MUST check the success "
"criterion at EVERY step inside the loop, not only at the end.\n"
"- WRONG pattern (produces degenerate ceiling data):\n"
" ```\n"
" for t in range(horizon):\n"
" obs, r, done, info = env.step(a)\n"
" success = check(info) # only checked ONCE at end\n"
" time_to_X = horizon if not success else t + 1 # t+1 = horizon always!\n"
" ```\n"
"- CORRECT pattern (captures actual first-success time):\n"
" ```\n"
" time_to_X = horizon # default: never succeeded\n"
" for t in range(horizon):\n"
" obs, r, done, info = env.step(a)\n"
" if check(info) and time_to_X == horizon: # first success\n"
" time_to_X = t + 1\n"
" if done: break\n"
" ```\n"
"- This bug causes ALL methods to return the same ceiling value, making "
"the entire experiment useless. Every method looks identical at the cap.\n"
"- APPLY THIS TO ALL CONDITIONS: RandomSearch, BO, RL — every single "
"condition must check at every step. If even one condition uses the wrong "
"pattern, the comparison is invalid.\n\n"
"METRIC DISCRIMINATION VALIDATION (CRITICAL):\n"
"- After running all conditions, check if all conditions produce the SAME "
"mean metric value. If they do, the metric is NOT discriminative and the "
"experiment is scientifically useless.\n"
"- Common causes: ceiling/floor effects, too-easy or too-hard tasks, "
"time-to-event bug above, metric that doesn't capture real differences.\n"
"- If all conditions have identical means, print "
"`WARNING: DEGENERATE_METRICS all conditions have same mean=` "
"and you MUST take corrective action:\n"
" (a) If all means = 1.0 or max: increase task difficulty (reduce budget, "
"increase noise, enlarge hypothesis space)\n"
" (b) If all means = 0.0: decrease difficulty\n"
" (c) Re-run after adjustment and verify means now differ\n"
" (d) If adjustments don't help, switch to a different primary metric\n"
"- A degenerate experiment CANNOT produce a publishable paper. Fix it.\n\n"
"DIFFICULTY CALIBRATION (CRITICAL for meaningful results):\n"
"- After running a pilot (3-5 seeds, 2 conditions: random_search + one RL), "
"check BOTH success rate AND metric discrimination.\n"
"- TWO things must be true for the experiment to be informative:\n"
" 1. Success rate between 30-80% (not too hard, not too easy)\n"
" 2. Primary metric varies across conditions (not all methods score the same)\n"
"- CEILING DETECTION (CRITICAL): If primary_metric is 1.0 (or max possible) "
"for ALL pilot seeds in ALL pilot conditions, the task is TRIVIALLY EASY. "
"You MUST increase difficulty until the metric varies. Options:\n"
" * Reduce experiment budget/horizon (fewer steps to find solution)\n"
" * Increase hypothesis space size\n"
" * Increase observation noise\n"
" * Tighten the success criterion (e.g., require closer match)\n"
" * Reduce the number of allowed experiments per episode\n"
"- FLOOR DETECTION: If primary_metric is 0.0 for all conditions, task is "
"too hard. Reduce noise, enlarge budget, simplify.\n"
"- Print `CALIBRATION: regime= pilot_success_rate= "
"pilot_primary_metric_std=` after calibration.\n"
"- If std=0, the metric is NOT discriminative — adjust until std > 0.\n"
"- Run a calibration loop: pilot → check → adjust → re-pilot (max 3 iterations).\n\n"
"ALGORITHM IMPLEMENTATION INTEGRITY (CRITICAL — mismatch = academic fraud):\n"
"1. If you name a method 'Bayesian Optimization', you MUST implement:\n"
" - A surrogate model (e.g., Gaussian Process or random forest)\n"
" - An acquisition function (e.g., Expected Improvement, UCB)\n"
" - Surrogate model updates after each observation\n"
" DO NOT implement UCB1 bandit and call it 'Bayesian Optimization'.\n"
"2. If you name a method 'PPO', you MUST implement:\n"
" - A clipped surrogate objective: min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t)\n"
" - A learned value function baseline\n"
" - The clip_eps parameter MUST be used in the policy update\n"
" DO NOT implement vanilla REINFORCE and call it 'PPO'.\n"
"3. Every declared hyperparameter MUST be used in the algorithm:\n"
" - If you declare clip_eps, it must appear in the loss computation\n"
" - If you declare entropy_coef, it must be added to the policy loss\n"
" - Dead parameters (declared but never used) are strictly forbidden\n"
"4. Ablation conditions MUST produce different behavior:\n"
" - Two conditions that differ only in a parameter that is never read are IDENTICAL\n"
" - Verify: if two conditions produce identical outputs on the same seed, "
"the ablation is broken and MUST be fixed\n"
" ABLATION DESIGN PATTERN (CRITICAL — #1 cause of broken ablations):\n"
" - 'no_key_component': Must REMOVE a core algorithmic component "
"(e.g., disable the graph structure by zeroing the adjacency, or remove "
"the contrastive loss, or disable the RL policy and use random actions). "
"The removal MUST change the forward() / step() computation.\n"
" - 'reduced_capacity': Must REDUCE model capacity by at least 2x "
"(e.g., halve hidden dimensions, reduce layers, shrink embedding size). "
"This MUST create a new model with different architecture, NOT just "
"rename a parameter with the same value.\n"
" - SELF-TEST: After implementing ablations, add a startup check that "
"runs one forward pass per condition on the SAME input and asserts outputs "
"differ. Print: `ABLATION_CHECK: vs outputs_differ=True`.\n"
" - If outputs are identical, the ablation is BROKEN — do not proceed.\n\n"
"CODE IMPLEMENTATION DEPTH (CRITICAL — shallow code = reject):\n"
"- Each algorithm/method MUST be a separate Python class with genuine logic.\n"
"- Each class MUST have at least: __init__(), and one core method "
"(forward/predict/train_step/step) with non-trivial implementation.\n"
"- The core method of the MAIN proposed method MUST be at least 20 lines "
"of effective code (excluding comments, blanks, imports).\n"
"- FORBIDDEN patterns that will be detected and rejected:\n"
" * `class MethodB(MethodA): pass` — empty subclass\n"
" * Two classes with identical method bodies but different names\n"
" * nn.Linear/nn.Conv2d created inside forward() instead of __init__()\n"
" * Variables defined only inside an if-branch but used after the branch\n"
" * Using np.erf() (doesn't exist — use scipy.special.erf or math.erf)\n"
" * Using ndarray.ptp() (removed in NumPy 2.0 — use np.ptp(arr) or arr.max()-arr.min())\n"
" * Using np.bool, np.int, np.float, np.complex (removed in NumPy 2.0 — use np.bool_, np.int64, etc.)\n"
" * Replacing real model training with synthetic utility functions or random scores\n"
" * Using dict[key] without ensuring key exists — use dict.get(key, default) "
"or verify key is in dict before access\n"
"- If the experiment plan includes 'implementation_spec', you MUST follow "
"the pseudocode steps exactly. Each algorithm_step should correspond to "
"1-3 lines of code in the class.\n"
"- Ablation variants MUST modify the forward() or step() logic, not just "
"change a hyperparameter value.\n\n"
"MINIMUM SEED COUNT (CRITICAL — 3 seeds = unpublishable):\n"
"- Use AT LEAST 5 random seeds per condition (10 preferred if time permits)\n"
"- Use AT LEAST 30 episodes per seed for RL methods\n"
"- When computing bootstrap CIs, use at least 1000 bootstrap samples\n"
"- For method comparisons: use paired bootstrap or Wilcoxon signed-rank test\n"
"- Report effect sizes (Cohen's d) alongside p-values\n\n"
"Experiment plan:\n{exp_plan}"
),
"max_tokens": 8192,
},
"resource_planning": {
"system": "You are an experiment scheduler.",
"user": (
"Create schedule JSON with GPU/time estimates.\n"
"Schema: {tasks:[{id,name,depends_on,gpu_count,estimated_minutes,"
"priority}], total_gpu_budget, generated}.\n"
"Experiment plan:\n{exp_plan}"
),
"json_mode": True,
},
# ── Phase F: Analysis & Decision ─────────────────────────────────────
"result_analysis": {
"system": (
"You are a quantitative research analyst. Always cite exact numbers "
"from the provided data."
),
"user": (
"{preamble}\n\n"
"{data_context}\n\n"
"Analyze run metrics and produce markdown report with statistical "
"interpretation.\n"
"Use the ACTUAL quantitative values provided above — do NOT invent "
"numbers.\n\n"
"SANITY CHECKS (perform BEFORE interpreting results):\n"
"1. MONOTONICITY: If a condition scales a parameter (e.g., N agents, "
"model size), check whether metrics move in the expected direction. "
"If accuracy *decreases* when adding more agents under majority voting, "
"flag this as a likely implementation bug (vote parsing, normalization, "
"or aggregation issue).\n"
"2. BASELINE PLAUSIBILITY: Random-chance baselines should match "
"theoretical expectations (e.g., 1/K for K-class classification).\n"
"3. CROSS-CONDITION CONSISTENCY: Results across datasets or conditions "
"should be internally coherent — wildly different patterns may indicate "
"confounds or bugs.\n"
"4. REPLICATION: If results are from a single seed (n=1), explicitly "
"note that no statistical significance claims can be made.\n"
"5. ABLATION ISOLATION: Compare per-seed values across conditions. If "
"two conditions produce IDENTICAL values for the same seed, this is a "
"RED FLAG — the ablation/variant may not have actually changed the code "
"path (e.g., config not applied, caching, shared state). Flag this "
"explicitly and recommend a config/registry audit.\n"
"6. METRIC DEFINITION CHECK: Look for a `METRIC_DEF:` line in the output. "
"If absent, flag that the primary metric is UNDEFINED — direction, units, "
"and formula are unknown, making all comparisons uninterpretable. This is "
"a critical methodology gap.\n"
"7. CONDITION COMPLETENESS CHECK: Look for `REGISTERED_CONDITIONS:` in "
"the output. Compare against the experiment plan. If conditions are missing "
"or failed (look for `CONDITION_FAILED:`), list them explicitly and assess "
"whether the remaining conditions can still answer the research question.\n"
"8. DEGENERATE METRICS CHECK: If ALL conditions (or all but one) produce "
"the SAME mean primary metric value, flag this as DEGENERATE — the metric "
"is NOT discriminative. Common causes: (a) time-to-event metric that only "
"checks success at the final step (returns horizon for all methods), "
"(b) ceiling/floor effects from too-easy or too-hard tasks, "
"(c) metric capped at a budget value. This makes the experiment "
"scientifically useless — recommend REFINE with a note to fix the metric "
"computation or task difficulty. Look for `WARNING: DEGENERATE_METRICS` "
"in stdout. Even if not printed, check the numbers yourself.\n\n"
"Required sections: Metrics Summary (with real values), "
"Consensus Findings (high confidence), "
"Contested Points (with evidence-based resolution), "
"Statistical Checks, Methodology Audit, Limitations, Conclusion.\n"
"In the Conclusion, include:\n"
"- Result quality rating (1-10)\n"
"- Key findings (3-5)\n"
"- Methodology gaps to address next\n"
"- Recommendation: PROCEED / REFINE / PIVOT\n\n"
"Run context:\n{context}"
),
"max_tokens": 8192,
},
"research_decision": {
"system": "You are a research program lead making go/no-go decisions.",
"user": (
"Based on the analysis, make one of three decisions:\n"
"- **PROCEED** — results are sufficient, move to paper writing\n"
"- **PIVOT** — hypotheses are fundamentally flawed, generate new ones\n"
"- **REFINE** — hypotheses are sound but experiments need re-tuning\n\n"
"MINIMUM QUALITY CRITERIA for PROCEED (ALL must be met):\n"
"1. At least 2 baselines AND the proposed method have results\n"
"2. The primary metric is defined (direction, units known)\n"
"3. Each condition has results from ≥3 seeds\n"
"4. No identical per-seed values across different conditions (ablation integrity)\n"
"5. The analysis quality rating is ≥4/10\n"
"If ANY criterion is not met, you MUST choose REFINE (not PROCEED).\n\n"
"Output markdown with sections:\n"
"## Decision\n"
"State exactly one of: PROCEED, PIVOT, or REFINE\n\n"
"## Justification\n"
"Why this decision is warranted based on evidence.\n\n"
"## Evidence\n"
"Key data points supporting the decision.\n\n"
"## Next Actions\n"
"Concrete steps for the chosen path.\n\n"
"Analysis:\n{analysis}"
),
},
# ── Phase G: Paper Writing ───────────────────────────────────────────
"paper_outline": {
"system": "You are an academic writing planner for top-tier AI conferences.",
"user": (
"{preamble}\n\n"
"{academic_style_guide}\n\n"
"Create a detailed paper outline in markdown.\n"
"Include per-section goals, word count targets, and evidence links.\n"
"The outline MUST include a catchy method name (2-5 chars) for the paper title.\n"
"Propose 3 candidate titles following the 'MethodName: Subtitle' format "
"(each <= 14 words). Rate each on memorability (1-5), specificity (1-5), "
"and novelty signal (1-5).\n"
"{topic_constraint}"
"{feedback}"
"Analysis:\n{analysis}\n\nDecision:\n{decision}"
),
"max_tokens": 8192,
},
"paper_draft": {
"system": (
"You are a top-tier academic paper author writing for leading venues.\n\n"
"KEY PRINCIPLES (from accepted paper analyses):\n"
"1. NOVELTY: A good paper has 1-2 key ideas and keeps the rest simple.\n"
"2. NARRATIVE: A short, rigorous, evidence-based technical story with a takeaway.\n"
"3. STRONG BASELINES: Invest real effort in making baselines competitive.\n"
"4. ABLATIONS: Remove one component at a time and measure the effect.\n"
"5. HONESTY: Acknowledge limitations explicitly.\n"
"6. REPRODUCIBILITY: Include all details needed to reproduce results.\n\n"
"EVIDENCE-BOUNDING RULES (CRITICAL — violation = reject):\n"
"7. EVERY claim in the title, abstract, and conclusion MUST be directly "
"supported by specific experimental metrics provided below.\n"
"8. If the experiment only covers partial conditions, the title MUST NOT "
"make global causal claims. Use 'Toward...', 'Investigating...', or "
"'An Empirical Study of...' instead of 'X Dominates Y'.\n"
"9. BEFORE writing the title, list the conditions actually tested and "
"their metric values. The title must only claim what those numbers show.\n"
"10. If a metric is a single scalar without condition labels, do NOT "
"claim comparative results between strategies/methods.\n"
"11. Distinguish between 'we propose and validate' (has full results) vs "
"'we propose and present preliminary evidence' (partial results).\n\n"
"You ONLY use real experimental data — never fabricate or approximate numbers.\n\n"
"METHOD SECTION REQUIREMENTS:\n"
"12. The Method section MUST include ALL implementation details needed "
"for reproduction: algorithm pseudocode or step-by-step description, "
"hyperparameters (learning rate, clipping, discount factor, etc.), "
"state/observation representation, reward definition, and baseline "
"configurations.\n"
"13. For learning-based methods: specify model architecture, training procedure "
"(iterations, epochs, batch handling), and any stability "
"mechanisms (regularization, normalization).\n"
"14. For baselines: specify the exact algorithm/method configuration "
"and any tuning performed to make baselines competitive.\n\n"
"FAILURE-AWARE REPORTING REQUIREMENTS:\n"
"15. If any method has a success rate < 100%, the Results section "
"MUST report success rates per method and explain inclusion/exclusion "
"criteria.\n"
"16. Report BOTH conditional metrics (successful runs only) AND "
"unconditional metrics (treating failures as worst-case). Without "
"both, comparative claims are biased by survivorship.\n"
"17. The Limitations section MUST discuss stability/reliability "
"if any method showed NaN/divergence/crashes.\n\n"
"BENCHMARK & ENVIRONMENT SPECIFICATION:\n"
"18. The Experiments section MUST fully specify the evaluation "
"environment: state/observation space, action space, hypothesis space, "
"noise model, episode length, and any randomization procedures.\n"
"19. Report results PER REGIME (e.g., per noise level, per problem "
"size) with separate tables or sub-sections. Aggregated-only results "
"cannot support claims about robustness or generality.\n"
"20. Include a table comparing all methods across all regimes with "
"paired statistical tests (bootstrap CI of paired differences, or "
"paired t-test p-values). Without this, comparative claims lack "
"statistical grounding.\n\n"
"METHOD NAMING RULES:\n"
"21. NEVER use generic labels like 'baseline_1', 'method_variant_1', "
"'method_variant_2' in the paper. Use descriptive algorithm/method names "
"that reflect what the method actually does. Generic labels make the paper "
"scientifically uninterpretable.\n"
"22. Each method MUST have a full description: architecture, "
"training procedure, key hyperparameters, and implementation details. "
"A reader should be able to reimplement every method from the paper alone.\n\n"
"STATISTICAL REPORTING (MANDATORY for acceptance):\n"
"23. EVERY result table MUST include 95% confidence intervals "
"(mean +/- CI or [low, high]).\n"
"24. EVERY comparison claim ('A outperforms B') MUST cite p-value. "
"If p >= 0.05, write: 'The difference is not statistically significant.'\n"
"25. If the proposed method does NOT statistically significantly "
"outperform a baseline, do NOT claim superiority. Reframe as "
"'comparable', 'competitive', or 'negative result'.\n\n"
"WRITING STYLE RULES:\n"
"26. DO NOT repeat disclaimers like 'due to computational constraints, "
"this analysis was not conducted' more than once. State each limitation "
"ONCE in the Limitations section.\n"
"27. The Limitations section should be concise (200-400 words) listing "
"3-5 key limitations. Do NOT scatter limitation disclaimers throughout "
"every section.\n"
"28. Focus 80% of the paper on WHAT YOU DID and WHAT YOU FOUND, not "
"on what you could not do. Positive scientific contribution should "
"dominate the paper.\n"
"29. Cite 25-40 unique references in the paper body. The Related Work "
"section alone should cite at least 15 references. Cite only directly "
"relevant work — do NOT pad with tangentially related papers.\n"
"30. CITE ORIGINAL PAPERS: When discussing a technique (e.g., Batch "
"Normalization, ResNet, Adam, PPO), ALWAYS cite the original paper that "
"introduced it. Do NOT cite a survey or follow-up instead of the original. "
"The available references list includes foundational papers — use them.\n"
"31. BASELINE MODERNITY: When discussing baselines and comparisons, ensure "
"the paper acknowledges whether the baselines represent current practice. "
"If baselines are older methods, explicitly discuss why they were chosen "
"and acknowledge stronger modern alternatives exist."
),
"user": (
"{preamble}\n\n"
"{academic_style_guide}\n"
"{narrative_writing_rules}\n"
"{anti_hedging_rules}\n"
"{anti_repetition_rules}\n"
"Write a full paper draft section by section in markdown.\n"
"Required sections: Title, Abstract, Introduction, Related Work, "
"Method, Experiments, Results, Discussion, Limitations, Broader Impact, "
"Conclusion, References.\n"
"The Broader Impact section (2-3 paragraphs) MUST discuss: "
"(1) potential positive societal impacts of this work, "
"(2) potential negative societal impacts or risks, "
"(3) ethical considerations specific to this research area. "
"This section is MANDATORY for top ML venues and recommended for all research papers.\n"
"{writing_structure}\n"
"{topic_constraint}"
"{exp_metrics_instruction}"
"{citation_instruction}"
"All experimental results MUST be presented in LaTeX tables or inline prose. "
"Raw metric path formats like 'method/env/step/metric: value' are FORBIDDEN "
"in the paper text. Convert all data to clean, formatted presentation.\n"
"The paper MUST fit within 10 pages (excluding references and appendix). "
"Aim for 8-9 pages of main content. Be concise.\n"
"FIGURE RULES: When referencing figures, use ONLY \\ref{fig:label} cross-references. "
"NEVER add bold standalone paragraphs like '**Figure 1.**' after figure environments. "
"Do NOT add \\clearpage before or after figures/tables unless absolutely necessary.\n"
"TABLE RULES: Tables MUST use standard LaTeX tabular syntax with bare braces: "
"\\begin{tabular}{lcc}, NOT \\begin{tabular}\\{lcc\\}. "
"NEVER use '--' as placeholder values in table cells. "
"If a metric is unavailable, write 'N/A' or omit the row entirely.\n"
"Outline:\n{outline}"
),
"max_tokens": 16384,
},
"peer_review": {
"system": "You are a balanced conference reviewer.",
"user": (
"Simulate peer review from at least 3 reviewer perspectives.\n"
"Output markdown with Reviewer A (methodology expert), "
"Reviewer B (domain expert), and Reviewer C (statistics/rigor expert), "
"each including strengths, weaknesses, and actionable revisions.\n\n"
"Check specifically:\n"
"1. TOPIC ALIGNMENT: Does the paper stay on topic ({topic})? "
"Flag any sections where the paper drifts to unrelated topics or "
"presents environment issues as contributions.\n"
"2. CLAIM-EVIDENCE ALIGNMENT: For EACH claim in the title, abstract, "
"and conclusion, verify there is a specific metric/table/figure in "
"the Results section supporting it. Flag unsupported claims.\n"
"3. STATISTICAL VALIDITY: Are confidence intervals or error bars "
"reported? Is n>1 (multiple seeds)? Are significance tests appropriate?\n"
"4. COMPLETENESS: Does the paper have all required sections with "
"sufficient depth? A NeurIPS paper body should be 5,000-6,500 words.\n"
"5. REPRODUCIBILITY: Are hyperparameters, random seeds, compute "
"resources, and dataset details fully specified?\n"
"6. WRITING QUALITY: Is the paper written in flowing prose or bullet lists? "
"Flag any bullet-point lists in Method/Results/Discussion. Check for "
"excessive hedging ('we do not claim'). Verify title is <= 14 words.\n"
"7. FIGURES: Does the paper include at least 2 figures? Zero figures = desk reject.\n"
"8. CITATION DISTRIBUTION: Are citations only in Intro/Related Work? "
"Method, Experiments, and Discussion MUST also cite relevant papers.\n\n"
"Paper draft:\n{draft}\n\n"
"Experiment evidence for verification:\n{experiment_evidence}"
),
"max_tokens": 8192,
},
"paper_revision": {
"system": (
"You are a paper revision expert.\n\n"
"TITLE AND ABSTRACT ALIGNMENT (CRITICAL):\n"
"- After reviewing experimental evidence, UPDATE the title if results "
"do not support the original claim.\n"
"- If the proposed method does NOT beat baselines, use a title like "
"'An Empirical Study of...', 'When X Falls Short: ...', or "
"'Investigating ... : Negative Results and Insights'.\n"
"- Rewrite the abstract to accurately reflect what was FOUND, not "
"what was hoped. The abstract must match actual numbers.\n"
"- The conclusion MUST match actual results — no aspirational claims.\n\n"
"IMPORTANT WRITING RULES:\n"
"- Do NOT add disclaimers like 'due to computational constraints' "
"or 'this analysis was not conducted'. If a limitation exists, "
"mention it ONCE in the Limitations section only.\n"
"- Focus 80% of the paper on what was DONE and what was FOUND.\n"
"- Do NOT add hedging language that was not in the original draft.\n"
"- Keep Limitations to 200-400 words with 3-5 concise points.\n"
"- Ensure every comparison claim cites a p-value or states that "
"the difference is not statistically significant.\n"
),
"user": (
"{academic_style_guide}\n"
"{narrative_writing_rules}\n"
"{anti_hedging_rules}\n"
"{anti_repetition_rules}\n"
"Revise the paper draft to address all review comments.\n"
"Return revised markdown only.\n\n"
"CRITICAL REVISION RULES:\n"
"- Transform any remaining bullet-point lists in the body into flowing "
"prose paragraphs. The only allowed lists are in the Introduction's contribution "
"paragraph and the Limitations section.\n"
"- The title MUST be <= 14 words with a catchy method name.\n"
"- MANDATORY: The revised paper MUST contain at least 2 markdown image references\n"
" (). If the draft has zero figures, ADD them in the Results\n"
" section using the chart files. A paper with zero figures will be desk-rejected.\n"
"- Consolidate ALL hedging/caveats into Limitations section only.\n"
"- The final paper body MUST be <= 6,500 words (standard 9-page conference limit).\n"
" If the current draft exceeds this, compress by removing redundant restatements.\n"
"- If the paper exceeds 10 pages, aggressively cut redundant content, "
"merge similar sections, and tighten prose. Target 8-9 pages of main content.\n"
"- Do NOT add '**Figure N.**' bold paragraphs after figure environments — "
"use only \\ref{fig:label} cross-references. Do NOT add \\clearpage "
"before figures or tables.\n"
"- NEVER use '--' placeholder values in tables. Replace with actual values or 'N/A'.\n"
"- CITATION FORMAT (CRITICAL): All citations MUST remain in [cite_key] bracket "
"format exactly as they appear in the draft, e.g. [smith2024transformer]. "
"Do NOT convert them to author-year format like [Smith et al., 2024] or "
"(Smith et al., 2024). The downstream LaTeX converter relies on the "
"[cite_key] format to generate \\cite{{}} commands. Changing the format "
"will break all references in the final PDF.\n"
"- CITATION KEYS (CRITICAL): Do NOT invent or add new citation keys that "
"are not already present in the draft. If you want to reference additional "
"prior work, describe it in prose WITHOUT a citation bracket. Every "
"[cite_key] you write MUST already exist in the bibliography. Adding "
"hallucinated keys like [smith2020method] creates broken [?] references "
"in the final PDF.\n"
"{writing_structure}\n"
"{topic_constraint}"
"Draft:\n{draft}\n\nReviews:\n{reviews}"
),
"max_tokens": 16384,
},
# ── Phase H: Finalization ────────────────────────────────────────────
"quality_gate": {
"system": "You are a final quality gate evaluator.",
"user": (
"Evaluate revised paper quality and return JSON.\n"
"Schema: {score_1_to_10:number, verdict:string, strengths:[...], "
"weaknesses:[...], required_actions:[...]}.\n"
"Threshold: {quality_threshold}\n"
"Paper:\n{revised}"
),
"json_mode": True,
},
"knowledge_archive": {
"system": "You produce reproducibility-focused research retrospectives.",
"user": (
"{preamble}\n\n"
"Write retrospective archive markdown with lessons, "
"reproducibility notes, and future work.\n"
"Decision:\n{decision}\n\nAnalysis:\n{analysis}\n\n"
"Revised paper:\n{revised}"
),
"max_tokens": 8192,
},
"export_publish": {
"system": "You are a publication formatting editor.",
"user": (
"Format revised paper into clean final markdown for publication "
"export.\n"
"Preserve content quality and readability.\n"
"CITATION FORMAT (CRITICAL): All citations MUST remain in [cite_key] bracket "
"format, e.g. [smith2024transformer]. Do NOT convert to author-year "
"format like [Smith et al., 2024]. The [cite_key] format is required "
"for downstream LaTeX \\cite{{}} generation.\n"
"Input paper:\n{revised}"
),
"max_tokens": 16384,
},
}
================================================
FILE: researchclaw/quality.py
================================================
"""Content quality assessment — template detection and metrics.
Detects placeholder/template content in LLM-generated text and provides
quality metrics for pipeline outputs.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Regex patterns that indicate placeholder/template text in LLM-generated
# documents.  Each entry is (pattern, human-readable description); patterns
# are applied per-line to stripped line content by detect_template_content()
# and compute_template_ratio() below.
_TEMPLATE_PATTERNS: list[tuple[str, str]] = [
    (
        # e.g. "Template Abstract" / "template introduction" headers
        r"(?i)template\s+(abstract|introduction|method|methodology|conclusion|discussion|results|related\s+work)",
        "Template section header",
    ),
    # Bracketed fill-in markers such as [INSERT ...], [TODO: ...], [PLACEHOLDER ...]
    (r"(?i)\[INSERT\s+.*?\]", "Insert placeholder"),
    (r"(?i)\[TODO\s*:?\s*.*?\]", "TODO placeholder"),
    (r"(?i)\[PLACEHOLDER\s*:?\s*.*?\]", "Explicit placeholder"),
    (r"(?i)lorem\s+ipsum", "Lorem ipsum filler"),
    (
        # "This section will describe..." — outline text left in the draft
        r"(?i)this\s+section\s+will\s+(describe|discuss|present|outline|explain)",
        "Future-tense placeholder",
    ),
    (
        r"(?i)we\s+will\s+(describe|discuss|present|outline|explain)\s+in\s+this\s+section",
        "Future-tense placeholder",
    ),
    (
        r"(?i)add\s+(your|the)\s+(content|text|description)\s+here",
        "Add content placeholder",
    ),
    (r"(?i)replace\s+this\s+(text|content|section)", "Replace placeholder"),
    # A heading line consisting only of "Section <number>"
    (r"(?i)^#+\s*section\s+\d+\s*$", "Generic section header"),
    (
        r"(?i)your\s+(abstract|introduction|method|results)\s+goes?\s+here",
        "Content placeholder",
    ),
    (r"(?i)sample\s+(abstract|introduction|text|content)", "Sample content marker"),
]
@dataclass(frozen=True)
class TemplateMatch:
    """A single template/placeholder detection."""

    # Human-readable description of the matched pattern (the second element
    # of the corresponding _TEMPLATE_PATTERNS entry).
    pattern_desc: str
    # 1-based line number in the scanned text where the match occurred.
    line_number: int
    # The matched text itself, truncated to at most 100 characters.
    excerpt: str
@dataclass(frozen=True)
class QualityReport:
    """Quality assessment for a text document.

    Captures document size, any template/placeholder matches found, and
    the estimated fraction of template content.
    """

    total_lines: int
    total_chars: int
    template_matches: tuple[TemplateMatch, ...] = ()
    template_ratio: float = 0.0

    @property
    def has_template_content(self) -> bool:
        """True when at least one template/placeholder match was found."""
        return bool(self.template_matches)

    @property
    def match_count(self) -> int:
        """Number of template/placeholder matches in the document."""
        return len(self.template_matches)

    def to_dict(self) -> dict[str, object]:
        """Serialize the report to a JSON-friendly dictionary.

        The template ratio is rounded to four decimal places; matches are
        flattened into plain dicts with pattern/line/excerpt keys.
        """
        rows: list[dict[str, object]] = []
        for match in self.template_matches:
            rows.append(
                {
                    "pattern": match.pattern_desc,
                    "line": match.line_number,
                    "excerpt": match.excerpt,
                }
            )
        return {
            "total_lines": self.total_lines,
            "total_chars": self.total_chars,
            "template_matches": rows,
            "template_ratio": round(self.template_ratio, 4),
            "has_template_content": self.has_template_content,
            "match_count": self.match_count,
        }
def detect_template_content(text: str) -> list[TemplateMatch]:
    """Scan text for template/placeholder patterns.

    Splits the text into lines, skips blank ones, and records a
    TemplateMatch (pattern description, 1-based line number, excerpt
    capped at 100 characters) for every regex hit, in document order.
    """
    found: list[TemplateMatch] = []
    for idx, raw_line in enumerate(text.split("\n"), start=1):
        content = raw_line.strip()
        if not content:
            continue
        for regex, description in _TEMPLATE_PATTERNS:
            found.extend(
                TemplateMatch(
                    pattern_desc=description,
                    line_number=idx,
                    excerpt=hit.group(0)[:100],
                )
                for hit in re.finditer(regex, content)
            )
    return found
def compute_template_ratio(text: str) -> float:
    """Estimate what fraction of the text is template/placeholder content.

    Returns 0.0 (fully original) through 1.0 (fully template).  Heuristic:
    characters on non-blank lines that match any template pattern, divided
    by characters on all non-blank lines.
    """
    if not text.strip():
        return 0.0
    non_blank = [ln.strip() for ln in text.split("\n") if ln.strip()]
    total_chars = sum(len(ln) for ln in non_blank)
    if total_chars == 0:
        return 0.0
    template_chars = sum(
        len(ln)
        for ln in non_blank
        if any(re.search(pattern, ln) for pattern, _ in _TEMPLATE_PATTERNS)
    )
    return min(template_chars / total_chars, 1.0)
def assess_quality(text: str) -> QualityReport:
    """Run the full quality assessment over a text document.

    Combines template detection and the template-ratio heuristic into a
    single QualityReport, logging a debug summary before returning it.
    """
    matches = detect_template_content(text)
    report = QualityReport(
        total_lines=len(text.split("\n")),
        total_chars=len(text),
        template_matches=tuple(matches),
        template_ratio=compute_template_ratio(text),
    )
    logger.debug(
        "quality assessed lines=%d chars=%d matches=%d ratio=%.4f",
        report.total_lines,
        report.total_chars,
        report.match_count,
        report.template_ratio,
    )
    return report
def check_strict_quality(text: str, *, threshold: float = 0.05) -> tuple[bool, str]:
    """Check whether text passes the strict quality gate.

    Returns (passed, message).  The gate fails when the estimated
    template ratio exceeds ``threshold``; the failure message includes
    up to five example matches.
    """
    report = assess_quality(text)
    if report.template_ratio <= threshold:
        return True, f"Quality check passed: template_ratio={report.template_ratio:.2%}"
    examples = "; ".join(
        f"L{match.line_number}: {match.excerpt}"
        for match in report.template_matches[:5]
    )
    failure = (
        f"Template content detected: ratio={report.template_ratio:.2%}, "
        f"{report.match_count} matches. Examples: {examples}"
    )
    return False, failure
================================================
FILE: researchclaw/report.py
================================================
"""Generate human-readable run reports from pipeline artifacts."""
# pyright: basic
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
def generate_report(run_dir: Path) -> str:
    """Generate a Markdown report from a pipeline run directory.

    Args:
        run_dir: Path to the run artifacts directory (e.g., artifacts/rc-xxx/)

    Returns:
        Markdown string with the report content.

    Raises:
        FileNotFoundError: If run_dir doesn't exist.
        ValueError: If run_dir has no pipeline_summary.json.
    """
    if not run_dir.exists():
        raise FileNotFoundError(f"Run directory not found: {run_dir}")
    summary_path = run_dir / "pipeline_summary.json"
    if not summary_path.exists():
        raise ValueError(f"No pipeline_summary.json found in {run_dir}")
    loaded = json.loads(summary_path.read_text(encoding="utf-8"))
    # Tolerate a summary file whose top level is not a JSON object.
    summary = loaded if isinstance(loaded, dict) else {}
    parts = [
        _header(summary, run_dir),
        _paper_section(run_dir),
        _experiment_section(run_dir),
        _citation_section(run_dir),
        _warnings_section(summary),
    ]
    # Empty sections (e.g. no warnings) are dropped from the output.
    return "\n\n".join(part for part in parts if part)
def _header(summary: dict[str, Any], run_dir: Path) -> str:
run_id = summary.get("run_id", "unknown")
stages_done = summary.get("stages_done", 0)
stages_total = summary.get("stages_executed", 0)
status = summary.get("final_status", "unknown")
generated = summary.get("generated", "unknown")
status_icon = "✅" if status == "done" else "❌" if status == "failed" else "⚠️"
lines = [
"# ResearchClaw Run Report",
"",
f"**Run ID**: {run_id}",
f"**Date**: {generated}",
f"**Status**: {status_icon} {status} ({stages_done}/{stages_total} stages done)",
f"**Artifacts**: `{run_dir}`",
]
return "\n".join(lines)
def _paper_section(run_dir: Path) -> str:
lines = ["## Paper"]
draft_path = run_dir / "stage-17" / "paper_draft.md"
if draft_path.exists():
text = draft_path.read_text(encoding="utf-8")
word_count = len(text.split())
lines.append(
f"- Draft: `{draft_path.relative_to(run_dir)}` (~{word_count} words)"
)
else:
lines.append("- Draft: not generated")
final_path = run_dir / "stage-22" / "paper_final.md"
if final_path.exists():
lines.append(f"- Final: `{final_path.relative_to(run_dir)}`")
tex_path = run_dir / "stage-22" / "paper.tex"
if tex_path.exists():
lines.append(f"- LaTeX: `{tex_path.relative_to(run_dir)}`")
rev_path = run_dir / "stage-19" / "paper_revised.md"
if rev_path.exists():
lines.append(f"- Revised: `{rev_path.relative_to(run_dir)}`")
return "\n".join(lines)
def _experiment_section(run_dir: Path) -> str:
lines = ["## Experiments"]
code_path = run_dir / "stage-10" / "experiment_code.py"
if code_path.exists():
lines.append(f"- Code: `{code_path.relative_to(run_dir)}`")
results_path = run_dir / "stage-12" / "experiment_results.json"
if results_path.exists():
try:
loaded = json.loads(results_path.read_text(encoding="utf-8"))
if isinstance(loaded, dict):
data = loaded
runs_default: list[Any] = []
iterations = data.get("iterations", data.get("runs", runs_default))
if isinstance(iterations, list):
lines.append(f"- Runs: {len(iterations)} iterations")
best = data.get("best_metric") or data.get("best_result")
if best is not None:
lines.append(f"- Best metric: {best}")
except (json.JSONDecodeError, TypeError):
lines.append("- Results: present (parse error)")
else:
lines.append("- Results: not available")
# BUG-215: Also search stage-14* versioned dirs when stage-14/ is missing.
analysis_path = run_dir / "stage-14" / "analysis.md"
if not analysis_path.exists():
for _s14 in sorted(run_dir.glob("stage-14*"), reverse=True):
_alt = _s14 / "analysis.md"
if _alt.exists():
analysis_path = _alt
break
if analysis_path.exists():
lines.append(f"- Analysis: `{analysis_path.relative_to(run_dir)}`")
return "\n".join(lines)
def _citation_section(run_dir: Path) -> str:
lines = ["## Citations"]
bib_path = run_dir / "stage-22" / "references.bib"
if not bib_path.exists():
bib_path = run_dir / "stage-04" / "references.bib"
if bib_path.exists():
text = bib_path.read_text(encoding="utf-8")
entries = re.findall(r"@\w+\{", text)
lines.append(f"- References: {len(entries)} BibTeX entries")
else:
lines.append("- References: not available")
verify_path = run_dir / "stage-23" / "verification_report.json"
if verify_path.exists():
try:
loaded = json.loads(verify_path.read_text(encoding="utf-8"))
vdata = loaded if isinstance(loaded, dict) else {}
total = int(vdata.get("total_references", 0))
verified = int(vdata.get("verified_count", 0))
suspicious = int(vdata.get("suspicious_count", 0))
hallucinated = int(vdata.get("hallucinated_count", 0))
pct = f"{verified / total * 100:.1f}%" if total > 0 else "N/A"
lines.append(f"- Verified: {verified}/{total} ({pct})")
if suspicious:
lines.append(f"- Suspicious: {suspicious}")
if hallucinated:
lines.append(f"- Hallucinated: {hallucinated}")
except (json.JSONDecodeError, TypeError, ZeroDivisionError):
lines.append("- Verification: present (parse error)")
else:
lines.append("- Verification: not run")
return "\n".join(lines)
def _warnings_section(summary: dict[str, Any]) -> str:
warnings: list[str] = []
stages_failed = summary.get("stages_failed", 0)
if stages_failed:
warnings.append(f"- ⚠️ {stages_failed} stage(s) failed during execution")
content_metrics = summary.get("content_metrics", {})
if isinstance(content_metrics, dict):
template_ratio = content_metrics.get("template_ratio")
if isinstance(template_ratio, (int, float)) and template_ratio > 0.1:
warnings.append(
f"- ⚠️ Template content detected: {template_ratio:.1%} of paper may be template text"
)
degraded = content_metrics.get("degraded_sources", [])
if isinstance(degraded, list) and degraded:
warnings.append(f"- ⚠️ Degraded sources: {', '.join(degraded)}")
if not warnings:
return ""
return "## Warnings\n" + "\n".join(warnings)
def print_report(run_dir: Path) -> None:
    """Generate the Markdown report for *run_dir* and write it to stdout."""
    report_text = generate_report(run_dir)
    print(report_text)
def write_report(run_dir: Path, output_path: Path) -> None:
    """Generate the Markdown report for *run_dir* and save it to *output_path*."""
    output_path.write_text(generate_report(run_dir), encoding="utf-8")
================================================
FILE: researchclaw/server/__init__.py
================================================
"""ResearchClaw Web server package."""
================================================
FILE: researchclaw/server/app.py
================================================
"""FastAPI application factory."""
from __future__ import annotations
import asyncio
import logging
from pathlib import Path
from typing import Any
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from researchclaw.config import RCConfig
from researchclaw.server.middleware.auth import TokenAuthMiddleware
from researchclaw.server.websocket.manager import ConnectionManager
from researchclaw.server.websocket.events import Event, EventType
logger = logging.getLogger(__name__)
# Shared application state accessible by routes
_app_state: dict[str, Any] = {}
def create_app(
    config: RCConfig,
    *,
    dashboard_only: bool = False,
    monitor_dir: str | None = None,
) -> FastAPI:
    """Create and configure the FastAPI application.

    Wires up CORS, optional bearer-token auth, the REST routers, the
    /ws/events WebSocket stream, the static frontend mount, and the
    startup background tasks (heartbeat + optional dashboard loop).

    Args:
        config: ResearchClaw configuration.
        dashboard_only: If True, only mount dashboard routes.
        monitor_dir: Specific run directory to monitor.

    Returns:
        The fully configured FastAPI instance.
    """
    app = FastAPI(
        title="ResearchClaw",
        description="Autonomous Research Pipeline — Web Interface",
        version="0.5.0",
    )
    # Store config in shared state so route modules can reach it via _app_state.
    _app_state["config"] = config
    _app_state["monitor_dir"] = monitor_dir
    # --- CORS ---
    app.add_middleware(
        CORSMiddleware,
        allow_origins=list(config.server.cors_origins),
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    # --- Token auth (only installed when a token is configured) ---
    if config.server.auth_token:
        app.add_middleware(TokenAuthMiddleware, token=config.server.auth_token)
    # --- WebSocket manager shared by routes and background broadcasters ---
    event_manager = ConnectionManager()
    _app_state["event_manager"] = event_manager
    # --- Health endpoint ---
    @app.get("/api/health")
    async def health() -> dict[str, Any]:
        # Liveness probe; also reports the live WebSocket client count.
        return {
            "status": "ok",
            "version": "0.5.0",
            "active_connections": event_manager.active_count,
        }

    @app.get("/api/config")
    async def config_summary() -> dict[str, Any]:
        # Small, non-sensitive slice of the configuration for the frontend.
        return {
            "project": config.project.name,
            "topic": config.research.topic,
            "mode": config.experiment.mode,
            "server": {
                "voice_enabled": config.server.voice_enabled,
                "dashboard_enabled": config.dashboard.enabled,
            },
        }
    # --- Routes ---
    from researchclaw.server.routes.pipeline import router as pipeline_router
    from researchclaw.server.routes.projects import router as projects_router
    app.include_router(pipeline_router)
    app.include_router(projects_router)
    if not dashboard_only:
        # Chat (and optionally voice) are only mounted in full-server mode.
        from researchclaw.server.routes.chat import router as chat_router, set_chat_manager
        set_chat_manager(event_manager)
        app.include_router(chat_router)
        if config.server.voice_enabled:
            from researchclaw.server.routes.voice import router as voice_router
            app.include_router(voice_router)
    # --- WebSocket events endpoint ---
    from fastapi import WebSocket, WebSocketDisconnect
    import uuid

    @app.websocket("/ws/events")
    async def events_ws(websocket: WebSocket) -> None:
        """Real-time event stream for dashboard."""
        client_id = f"evt-{uuid.uuid4().hex[:8]}"
        await event_manager.connect(websocket, client_id)
        try:
            while True:
                # Keep connection alive; client can send pings
                await websocket.receive_text()
        except WebSocketDisconnect:
            event_manager.disconnect(client_id)
    # --- Static files (frontend) ---
    # Three .parent hops from researchclaw/server/app.py reach the repo root.
    frontend_dir = Path(__file__).resolve().parent.parent.parent / "frontend"
    if frontend_dir.is_dir():
        app.mount("/static", StaticFiles(directory=str(frontend_dir)), name="static")
        # Serve index.html at root
        from fastapi.responses import FileResponse

        @app.get("/")
        async def index() -> FileResponse:
            return FileResponse(str(frontend_dir / "index.html"))
    # --- Background tasks ---
    @app.on_event("startup")
    async def startup() -> None:
        # NOTE(review): on_event("startup") is FastAPI's legacy hook; the
        # lifespan API is its successor — confirm before migrating.
        asyncio.create_task(event_manager.heartbeat_loop(interval=15.0))
        if config.dashboard.enabled:
            from researchclaw.dashboard.broadcaster import start_dashboard_loop
            asyncio.create_task(
                start_dashboard_loop(
                    event_manager,
                    interval=config.dashboard.refresh_interval_sec,
                    monitor_dir=monitor_dir,
                )
            )
        logger.info("ResearchClaw Web server started")
    return app
================================================
FILE: researchclaw/server/dialog/__init__.py
================================================
"""Dialog / conversational research modules."""
================================================
FILE: researchclaw/server/dialog/intents.py
================================================
"""Intent classification for conversational research."""
from __future__ import annotations
import re
from enum import Enum
from typing import Any
class Intent(str, Enum):
    """Research chat intents.

    str-valued so intent names serialize directly into JSON payloads.
    """

    TOPIC_SELECTION = "topic_selection"  # brainstorm / choose a research topic
    START_PIPELINE = "start_pipeline"    # launch a pipeline run
    CHECK_STATUS = "check_status"        # ask about run progress / current stage
    MODIFY_CONFIG = "modify_config"      # change settings or hyperparameters
    DISCUSS_RESULTS = "discuss_results"  # ask about metrics / outcomes
    EDIT_PAPER = "edit_paper"            # edit or review the paper draft
    GENERAL_CHAT = "general_chat"        # fallback for everything else
    HELP = "help"                        # usage / capability help
# Keyword patterns for fast classification.
# Scanned in order by classify_intent(); the first matching pattern wins,
# which is why the narrow HELP pattern precedes the broader ones.  Each
# regex mixes English keywords with Chinese equivalents.
_INTENT_PATTERNS: list[tuple[Intent, re.Pattern[str]]] = [
    (Intent.HELP, re.compile(
        r"(?:^\s*help\s*$|\bhow\s+to\b|\busage\b|帮助|怎么用)", re.IGNORECASE
    )),
    (Intent.START_PIPELINE, re.compile(
        r"(?:\b(?:start|run|begin|launch)\b|开始|启动|跑|运行)",
        re.IGNORECASE,
    )),
    (Intent.CHECK_STATUS, re.compile(
        r"(?:\b(?:status|progress|stage|current)\b|阶段|进度|到哪|第几|哪一步)", re.IGNORECASE
    )),
    (Intent.TOPIC_SELECTION, re.compile(
        r"(?:\b(?:topic|idea|direction)\b|research\s+direction|研究方向|选题|研究主题|想法)",
        re.IGNORECASE,
    )),
    (Intent.MODIFY_CONFIG, re.compile(
        r"(?:\b(?:config|setting|parameter|batch|epoch)\b|learning\s+rate|学习率|修改|设置)",
        re.IGNORECASE,
    )),
    (Intent.DISCUSS_RESULTS, re.compile(
        r"(?:\b(?:results?|metrics?|accuracy|loss|performance)\b|结果|指标|效果|怎么样)",
        re.IGNORECASE,
    )),
    (Intent.EDIT_PAPER, re.compile(
        r"(?:\b(?:paper|abstract|introduction|draft)\b|论文|摘要|改一下|写)",
        re.IGNORECASE,
    )),
]
def classify_intent(message: str) -> tuple[Intent, float]:
    """Classify user intent from message text.

    Returns (intent, confidence) with confidence in [0, 1].  Regex keyword
    matching keeps this fast; an LLM classifier could replace it later.
    """
    normalized = message.strip().lower()
    if not normalized:
        # Nothing to classify — lowest possible confidence.
        return Intent.GENERAL_CHAT, 0.0
    matched = next(
        (intent for intent, pattern in _INTENT_PATTERNS if pattern.search(normalized)),
        None,
    )
    if matched is not None:
        return matched, 0.8
    return Intent.GENERAL_CHAT, 0.5
================================================
FILE: researchclaw/server/dialog/router.py
================================================
"""Dialog router — routes messages to appropriate handlers."""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any
from researchclaw.server.dialog.intents import Intent, classify_intent
from researchclaw.server.dialog.session import ChatSession, SessionManager
logger = logging.getLogger(__name__)
# Module-wide registry: one ChatSession per client id, shared by every
# call to route_message().
_session_manager = SessionManager()
async def route_message(raw_message: str, client_id: str) -> str:
    """Route an incoming chat message and return the assistant response.

    The payload may be plain text or a JSON object carrying the text under
    a "message" (or legacy "text") key.  Anything else falls back to the
    raw string.

    Args:
        raw_message: Raw WebSocket payload from the client.
        client_id: Stable identifier used to look up the chat session.

    Returns:
        The handler's textual response (also recorded in session history).
    """
    # Parse message (could be plain text or JSON)
    try:
        msg_data = json.loads(raw_message)
    except (json.JSONDecodeError, TypeError):
        msg_data = None
    # json.loads can legally yield non-dict values (str, list, number, null);
    # only a dict envelope carries a message field.  Guarding here fixes an
    # uncaught AttributeError on payloads such as '"hello"'.
    if isinstance(msg_data, dict):
        text = msg_data.get("message", msg_data.get("text", raw_message))
    else:
        text = raw_message
    session = _session_manager.get_or_create(client_id)
    session.add_message("user", text)
    intent, confidence = classify_intent(text)
    logger.debug("Intent: %s (%.2f) for: %s", intent.value, confidence, text[:50])
    handler = _HANDLERS.get(intent, _handle_general)
    response = await handler(text, session)
    session.add_message("assistant", response)
    return response
async def _handle_help(text: str, session: ChatSession) -> str:
return (
"I can help you with:\n"
"- **Select a research topic**: describe your area of interest\n"
"- **Start a pipeline run**: say 'start experiment' or 'run pipeline'\n"
"- **Check progress**: ask 'what stage are we at?'\n"
"- **View results**: ask about metrics, accuracy, or results\n"
"- **Modify settings**: change learning rate, epochs, etc.\n"
"- **Edit paper**: suggest changes to abstract, introduction, etc.\n\n"
"Just type naturally — I'll figure out what you need!"
)
async def _handle_status(text: str, session: ChatSession) -> str:
    """Summarize the active pipeline run, or the most recent one."""
    from researchclaw.dashboard.collector import DashboardCollector

    runs = DashboardCollector().collect_all()
    if not runs:
        return "No pipeline runs found. Start one with 'start pipeline'."
    # Report the first active run when one exists.
    for run in runs:
        if run.is_active:
            return (
                f"**Active run**: {run.run_id}\n"
                f"- Stage: {run.current_stage}/23 ({run.current_stage_name})\n"
                f"- Status: {run.status}\n"
                f"- Topic: {run.topic or '(not set)'}"
            )
    latest = runs[0]
    return (
        f"**Latest run**: {latest.run_id}\n"
        f"- Stage: {latest.current_stage}/23\n"
        f"- Status: {latest.status}\n"
        f"- Stages completed: {len(latest.stages_completed)}"
    )
async def _handle_start(text: str, session: ChatSession) -> str:
return (
"To start a pipeline run, use the dashboard or API:\n"
"```\n"
"POST /api/pipeline/start\n"
'{"topic": "your research topic", "auto_approve": true}\n'
"```\n"
"Or run from CLI: `researchclaw run -c config.yaml`\n\n"
"Would you like me to help you set up the configuration?"
)
async def _handle_topic(text: str, session: ChatSession) -> str:
return (
"Let me help you find a research direction!\n\n"
"Please tell me:\n"
"1. Your research **domain** (e.g., CV, NLP, RL, AI4Science)\n"
"2. Any **specific interests** (e.g., robustness, efficiency, fairness)\n"
"3. Your **target venue** (e.g., NeurIPS, ICML, ICLR)\n\n"
"I'll suggest novel, timely research angles based on recent trends."
)
async def _handle_config(text: str, session: ChatSession) -> str:
return (
"You can modify the configuration through:\n"
"1. Edit `config.yaml` directly\n"
"2. Use the wizard: `researchclaw wizard`\n"
"3. Pass overrides when starting: "
'`POST /api/pipeline/start {"config_overrides": {...}}`\n\n'
"What setting would you like to change?"
)
async def _handle_results(text: str, session: ChatSession) -> str:
    """Report scalar metrics from the most recent run, if any exist."""
    from researchclaw.dashboard.collector import DashboardCollector

    runs = DashboardCollector().collect_all()
    if not runs:
        return "No runs found yet. Start a pipeline first."
    latest = runs[0]
    if not latest.metrics:
        return f"Run {latest.run_id} has no metrics yet (stage {latest.current_stage}/23)."
    scalar_lines = [
        f"- {name}: {value}"
        for name, value in latest.metrics.items()
        if isinstance(value, (int, float))
    ]
    if not scalar_lines:
        # No scalar metrics to itemize; dump the raw mapping instead.
        return f"Metrics: {latest.metrics}"
    return "\n".join([f"**Results for {latest.run_id}**:\n", *scalar_lines])
async def _handle_paper(text: str, session: ChatSession) -> str:
return (
"Paper editing is available after Stage 17 (Paper Draft).\n\n"
"I can help with:\n"
"- Review and suggest improvements to the abstract\n"
"- Check the introduction structure\n"
"- Verify experiment descriptions match actual results\n"
"- Improve related work coverage\n\n"
"Which section would you like to work on?"
)
async def _handle_general(text: str, session: ChatSession) -> str:
return (
"I'm your ResearchClaw assistant. I can help with:\n"
"- Selecting research topics\n"
"- Running experiments\n"
"- Monitoring progress\n"
"- Analyzing results\n"
"- Editing papers\n\n"
"What would you like to do?"
)
# Intent → coroutine handler dispatch table used by route_message();
# unmatched intents fall back to _handle_general there.
_HANDLERS = {
    Intent.HELP: _handle_help,
    Intent.CHECK_STATUS: _handle_status,
    Intent.START_PIPELINE: _handle_start,
    Intent.TOPIC_SELECTION: _handle_topic,
    Intent.MODIFY_CONFIG: _handle_config,
    Intent.DISCUSS_RESULTS: _handle_results,
    Intent.EDIT_PAPER: _handle_paper,
    Intent.GENERAL_CHAT: _handle_general,
}
================================================
FILE: researchclaw/server/dialog/session.py
================================================
"""Conversation session management."""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
@dataclass
class ChatMessage:
    """One utterance in a chat transcript."""

    role: str  # "user" or "assistant"
    content: str
    timestamp: float = field(default_factory=time.time)  # creation time, unix seconds

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain JSON-friendly dict."""
        return dict(role=self.role, content=self.content, timestamp=self.timestamp)
@dataclass
class ChatSession:
    """Per-client chat session state."""

    client_id: str
    history: list[ChatMessage] = field(default_factory=list)
    current_project: str = ""
    current_run: str = ""
    created_at: float = field(default_factory=time.time)
    MAX_HISTORY: int = 50  # hard cap on retained messages

    def add_message(self, role: str, content: str) -> ChatMessage:
        """Append a message, trimming history to the newest MAX_HISTORY."""
        entry = ChatMessage(role=role, content=content)
        self.history.append(entry)
        # Trim to prevent unbounded growth
        if len(self.history) > self.MAX_HISTORY:
            self.history = self.history[-self.MAX_HISTORY:]
        return entry

    def get_context(self, last_n: int = 10) -> list[dict[str, str]]:
        """Return up to *last_n* most recent messages as role/content dicts."""
        recent = self.history[-last_n:]
        return [{"role": msg.role, "content": msg.content} for msg in recent]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the full session (including history) for persistence."""
        payload: dict[str, Any] = {
            "client_id": self.client_id,
            "current_project": self.current_project,
            "current_run": self.current_run,
            "history": [msg.to_dict() for msg in self.history],
            "created_at": self.created_at,
        }
        return payload
class SessionManager:
    """Registry of in-memory chat sessions with optional JSON persistence."""

    def __init__(self, persist_dir: str = ".researchclaw/sessions") -> None:
        self._sessions: dict[str, ChatSession] = {}
        self._persist_dir = Path(persist_dir)

    def get_or_create(self, client_id: str) -> ChatSession:
        """Return the session for *client_id*, creating it on first use."""
        session = self._sessions.get(client_id)
        if session is None:
            session = ChatSession(client_id=client_id)
            self._sessions[client_id] = session
        return session

    def remove(self, client_id: str) -> None:
        """Forget a session (no-op when the id is unknown)."""
        self._sessions.pop(client_id, None)

    def save(self, client_id: str) -> None:
        """Best-effort persistence of one session to <persist_dir>/<id>.json."""
        session = self._sessions.get(client_id)
        if session is None:
            return
        self._persist_dir.mkdir(parents=True, exist_ok=True)
        target = self._persist_dir / f"{client_id}.json"
        try:
            with target.open("w", encoding="utf-8") as fh:
                json.dump(session.to_dict(), fh, ensure_ascii=False, indent=2)
        except Exception:
            # Persistence is best-effort; never let it break the chat flow.
            logger.debug("Failed to persist session %s", client_id)

    def load(self, client_id: str) -> ChatSession | None:
        """Restore a session from disk; None when absent or unreadable."""
        source = self._persist_dir / f"{client_id}.json"
        if not source.exists():
            return None
        try:
            with source.open() as fh:
                data = json.load(fh)
            restored = ChatSession(
                client_id=data["client_id"],
                current_project=data.get("current_project", ""),
                current_run=data.get("current_run", ""),
                created_at=data.get("created_at", time.time()),
            )
            restored.history.extend(
                ChatMessage(
                    role=item["role"],
                    content=item["content"],
                    timestamp=item.get("timestamp", 0),
                )
                for item in data.get("history", [])
            )
            self._sessions[client_id] = restored
            return restored
        except Exception:
            logger.debug("Failed to load session %s", client_id)
            return None
================================================
FILE: researchclaw/server/middleware/__init__.py
================================================
"""Server middleware modules."""
================================================
FILE: researchclaw/server/middleware/auth.py
================================================
"""Basic token authentication middleware."""
from __future__ import annotations
from typing import Callable, Awaitable
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
class TokenAuthMiddleware(BaseHTTPMiddleware):
    """Optional bearer-token authentication.

    If *token* is empty, all requests are allowed (no-op).
    """

    # Paths that never require auth (health probe + OpenAPI docs UI)
    EXEMPT_PATHS = frozenset({"/api/health", "/docs", "/openapi.json"})

    def __init__(self, app: object, token: str = "") -> None:
        # token: shared secret compared verbatim against the client value.
        super().__init__(app)  # type: ignore[arg-type]
        self._token = token

    async def dispatch(
        self,
        request: Request,
        call_next: Callable[[Request], Awaitable[Response]],
    ) -> Response:
        """Reject requests whose token does not match; otherwise forward.

        HTTP requests present the token as an "Authorization: Bearer <...>"
        header; /ws paths are expected to carry it as a ?token= query
        parameter instead.
        NOTE(review): Starlette's BaseHTTPMiddleware presumably dispatches
        only HTTP requests, so the /ws branch may never run for actual
        WebSocket upgrades — confirm how WebSockets get authenticated.
        """
        # No-op when token is unset
        if not self._token:
            return await call_next(request)
        # Skip auth for exempt paths and static files
        path = request.url.path
        if path in self.EXEMPT_PATHS or path.startswith("/static"):
            return await call_next(request)
        # WebSocket connections carry token as query param
        if path.startswith("/ws"):
            token = request.query_params.get("token", "")
        else:
            auth_header = request.headers.get("authorization", "")
            # removeprefix leaves the raw header untouched when there is no
            # "Bearer " prefix, so bare-token headers are also accepted.
            token = auth_header.removeprefix("Bearer ").strip()
        if token != self._token:
            return JSONResponse(
                {"detail": "Unauthorized"}, status_code=401
            )
        return await call_next(request)
================================================
FILE: researchclaw/server/routes/__init__.py
================================================
"""API route modules."""
================================================
FILE: researchclaw/server/routes/chat.py
================================================
"""Chat WebSocket endpoint for conversational research."""
from __future__ import annotations
import logging
import uuid
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from researchclaw.server.websocket.events import Event, EventType
from researchclaw.server.websocket.manager import ConnectionManager
logger = logging.getLogger(__name__)
router = APIRouter(tags=["chat"])
# Global connection manager (initialized by app.py via set_chat_manager());
# None until the application factory wires it in.
_chat_manager: ConnectionManager | None = None
def set_chat_manager(manager: ConnectionManager) -> None:
    """Install the shared connection manager (called once by app.py)."""
    global _chat_manager
    _chat_manager = manager
def get_chat_manager() -> ConnectionManager:
    """Return the shared connection manager.

    Raises:
        RuntimeError: if set_chat_manager() has not been called yet.
    """
    manager = _chat_manager
    if manager is None:
        raise RuntimeError("Chat manager not initialized")
    return manager
@router.websocket("/ws/chat")
async def chat_websocket(websocket: WebSocket) -> None:
    """WebSocket endpoint for conversational research chat.

    Each connection gets a short random client id.  Every received frame is
    routed through the dialog router and answered with a CHAT_RESPONSE
    event, or an ERROR event when the handler raises — the socket stays
    open either way until the client disconnects.
    """
    manager = get_chat_manager()
    client_id = str(uuid.uuid4())[:8]
    await manager.connect(websocket, client_id)
    try:
        while True:
            raw = await websocket.receive_text()
            try:
                # Imported lazily per message (NOTE(review): presumably to
                # avoid an import cycle with the dialog package — confirm).
                from researchclaw.server.dialog.router import route_message
                response = await route_message(raw, client_id)
                await manager.send_to(
                    client_id,
                    Event(
                        type=EventType.CHAT_RESPONSE,
                        data={"message": response, "client_id": client_id},
                    ),
                )
            except Exception as exc:
                # Report the failure to the client instead of dropping the socket.
                logger.exception("Chat error for %s", client_id)
                await manager.send_to(
                    client_id,
                    Event(
                        type=EventType.ERROR,
                        data={"error": str(exc), "client_id": client_id},
                    ),
                )
    except WebSocketDisconnect:
        manager.disconnect(client_id)
================================================
FILE: researchclaw/server/routes/pipeline.py
================================================
"""Pipeline control API routes."""
from __future__ import annotations
import asyncio
import json
import logging
from pathlib import Path
from typing import Any
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
logger = logging.getLogger(__name__)
import re as _re
# Expected run-id shape: rc-<YYYYMMDD>-<HHMMSS>-<lowercase hex>.  Anchored
# on both ends so ids cannot smuggle path separators into artifact paths.
_RUN_ID_RE = _re.compile(r"^rc-\d{8}-\d{6}-[a-f0-9]+$")
def _validated_run_dir(run_id: str) -> Path:
    """Validate *run_id* and map it to its artifacts/ directory.

    Raises:
        HTTPException: 400 when the id does not match the rc-<date>-<time>-<hex>
            pattern, or when the resolved path escapes artifacts/.
    """
    if _RUN_ID_RE.match(run_id) is None:
        raise HTTPException(status_code=400, detail=f"Invalid run_id format: {run_id}")
    artifacts_root = Path("artifacts")
    run_dir = artifacts_root / run_id
    # Defense in depth: the regex already forbids path separators, but still
    # verify the resolved location sits inside artifacts/.
    if not run_dir.resolve().is_relative_to(artifacts_root.resolve()):
        raise HTTPException(status_code=400, detail=f"Invalid run_id: {run_id}")
    return run_dir
router = APIRouter(prefix="/api", tags=["pipeline"])
class PipelineStartRequest(BaseModel):
    """Request body for starting a pipeline run."""

    # Research topic override; the server config's topic is used when None.
    topic: str | None = None
    # NOTE(review): accepted but not applied by start_pipeline() in this
    # module — confirm whether overrides are consumed elsewhere.
    config_overrides: dict[str, Any] | None = None
    # Forwarded to execute_pipeline(auto_approve_gates=...).
    auto_approve: bool = True
class PipelineStartResponse(BaseModel):
    """Response after starting a pipeline."""

    run_id: str      # generated rc-<timestamp>-<topic hash> identifier
    status: str      # always "running" at start time
    output_dir: str  # artifacts/<run_id> path where stage outputs land
# In-memory tracking of the active run (single-tenant MVP).
# _active_run is the mutable status dict shared with the background task
# (None when idle); _run_task is the asyncio handle /pipeline/stop cancels.
_active_run: dict[str, Any] | None = None
_run_task: asyncio.Task[Any] | None = None
def _get_app_state() -> dict[str, Any]:
    """Get shared application state (set by app.py)."""
    from researchclaw.server import app as _server_app
    return _server_app._app_state
@router.post("/pipeline/start", response_model=PipelineStartResponse)
async def start_pipeline(req: PipelineStartRequest) -> PipelineStartResponse:
    """Start a new pipeline run.

    Builds a run id from the UTC timestamp plus a hash of the topic,
    creates artifacts/<run_id>/, and launches the pipeline in a background
    task (executing the blocking runner in the default thread pool).  Only
    one run may be active at a time.

    Raises:
        HTTPException: 409 when a run is already in progress.
    """
    global _active_run, _run_task
    if _active_run and _active_run.get("status") == "running":
        raise HTTPException(status_code=409, detail="A pipeline is already running")
    state = _get_app_state()
    config = state["config"]
    if req.topic:
        # Configs are dataclasses; build copies with the topic swapped in.
        import dataclasses
        new_research = dataclasses.replace(config.research, topic=req.topic)
        config = dataclasses.replace(config, research=new_research)
    import hashlib
    from datetime import datetime, timezone
    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    topic_hash = hashlib.sha256(config.research.topic.encode()).hexdigest()[:6]
    run_id = f"rc-{ts}-{topic_hash}"
    run_dir = _validated_run_dir(run_id)
    run_dir.mkdir(parents=True, exist_ok=True)
    _active_run = {
        "run_id": run_id,
        "status": "running",
        "output_dir": str(run_dir),
        "topic": config.research.topic,
    }
    async def _run_in_background() -> None:
        # Executes the blocking pipeline off the event loop and mirrors its
        # outcome into the module-level _active_run dict.
        global _active_run
        try:
            from researchclaw.adapters import AdapterBundle
            from researchclaw.pipeline.runner import execute_pipeline
            kb_root = Path(config.knowledge_base.root) if config.knowledge_base.root else None
            if kb_root:
                kb_root.mkdir(parents=True, exist_ok=True)
            # Fix: get_event_loop() inside a coroutine is deprecated since
            # Python 3.10; get_running_loop() is the correct call here.
            loop = asyncio.get_running_loop()
            results = await loop.run_in_executor(
                None,
                lambda: execute_pipeline(
                    run_dir=run_dir,
                    run_id=run_id,
                    config=config,
                    adapters=AdapterBundle(),
                    auto_approve_gates=req.auto_approve,
                    skip_noncritical=True,
                    kb_root=kb_root,
                ),
            )
            done = sum(1 for r in results if r.status.value == "done")
            failed = sum(1 for r in results if r.status.value == "failed")
            if _active_run:
                _active_run["status"] = "completed" if failed == 0 else "failed"
                _active_run["stages_done"] = done
                _active_run["stages_failed"] = failed
        except Exception as exc:
            logger.exception("Pipeline run failed")
            if _active_run:
                _active_run["status"] = "failed"
                _active_run["error"] = str(exc)
    _run_task = asyncio.create_task(_run_in_background())
    return PipelineStartResponse(
        run_id=run_id,
        status="running",
        output_dir=str(run_dir),
    )
@router.post("/pipeline/stop")
async def stop_pipeline() -> dict[str, str]:
    """Cancel the currently running pipeline task.

    Raises:
        HTTPException: 404 when no run has been started.
    """
    global _active_run, _run_task
    if not _run_task or not _active_run:
        raise HTTPException(status_code=404, detail="No pipeline is running")
    _run_task.cancel()
    _active_run["status"] = "stopped"
    return {"status": "stopped"}
@router.get("/pipeline/status")
async def pipeline_status() -> dict[str, Any]:
    """Return the live run-status dict, or {"status": "idle"} when none."""
    return _active_run if _active_run else {"status": "idle"}
@router.get("/pipeline/stages")
async def pipeline_stages() -> dict[str, Any]:
    """Get the 23-stage pipeline definition."""
    from researchclaw.pipeline.stages import Stage
    stages = [
        {
            "number": int(member),
            "name": member.name,
            # Fall back to a prettified enum name when no label is declared.
            "label": getattr(member, "label", member.name.replace("_", " ").title()),
            "phase": getattr(member, "phase", ""),
        }
        for member in Stage
    ]
    return {"stages": stages}
@router.get("/runs")
async def list_runs() -> dict[str, Any]:
    """List historical pipeline runs found under artifacts/."""
    artifacts = Path("artifacts")
    runs: list[dict[str, Any]] = []
    if artifacts.exists():
        for entry in sorted(artifacts.iterdir(), reverse=True):
            if not entry.is_dir() or not entry.name.startswith("rc-"):
                continue
            info: dict[str, Any] = {"run_id": entry.name, "path": str(entry)}
            # Attach checkpoint contents when readable; skip silently otherwise.
            ckpt = entry / "checkpoint.json"
            if ckpt.exists():
                try:
                    with ckpt.open() as f:
                        info["checkpoint"] = json.load(f)
                except Exception:
                    pass
            runs.append(info)
    # Cap the response at the 50 most recent runs.
    return {"runs": runs[:50]}
@router.get("/runs/{run_id}")
async def get_run(run_id: str) -> dict[str, Any]:
    """Get details for a specific run.

    Raises:
        HTTPException: 400 for malformed ids, 404 for unknown runs.
    """
    run_dir = _validated_run_dir(run_id)
    if not run_dir.exists():
        raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
    info: dict[str, Any] = {"run_id": run_id, "path": str(run_dir)}
    ckpt = run_dir / "checkpoint.json"
    if ckpt.exists():
        try:
            with ckpt.open() as f:
                info["checkpoint"] = json.load(f)
        except Exception:
            pass
    # Completed stage directories (stage-*)
    info["stages_completed"] = sorted(
        child.name
        for child in run_dir.iterdir()
        if child.is_dir() and child.name.startswith("stage-")
    )
    # Flag which paper artifacts exist anywhere in the tree (has_md / has_tex / has_pdf).
    for pattern in ("paper.md", "paper.tex", "paper.pdf"):
        if any(run_dir.rglob(pattern)):
            info[f"has_{pattern.split('.')[1]}"] = True
    return info
@router.get("/runs/{run_id}/metrics")
async def get_run_metrics(run_id: str) -> dict[str, Any]:
    """Return the contents of the run's results.json (empty when missing).

    Raises:
        HTTPException: 400 for malformed ids, 404 for unknown runs.
    """
    run_dir = _validated_run_dir(run_id)
    if not run_dir.exists():
        raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
    metrics: dict[str, Any] = {}
    results_file = run_dir / "results.json"
    if results_file.exists():
        try:
            with results_file.open() as f:
                metrics = json.load(f)
        except Exception:
            # Corrupt/unreadable results are reported as empty metrics.
            pass
    return {"run_id": run_id, "metrics": metrics}
================================================
FILE: researchclaw/server/routes/projects.py
================================================
"""Project listing / status API routes."""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from fastapi import APIRouter
router = APIRouter(prefix="/api", tags=["projects"])
@router.get("/projects")
async def list_projects() -> dict[str, Any]:
    """List all project directories (artifacts/rc-*)."""
    artifacts = Path("artifacts")
    projects: list[dict[str, Any]] = []
    if artifacts.exists():
        for entry in sorted(artifacts.iterdir(), reverse=True):
            if not entry.is_dir() or not entry.name.startswith("rc-"):
                continue
            proj: dict[str, Any] = {"id": entry.name, "path": str(entry)}
            ckpt = entry / "checkpoint.json"
            if not ckpt.exists():
                proj["status"] = "no_checkpoint"
            else:
                try:
                    with ckpt.open() as f:
                        ckpt_data = json.load(f)
                    proj["current_stage"] = ckpt_data.get("stage")
                    proj["status"] = ckpt_data.get("status", "unknown")
                except Exception:
                    # Unreadable checkpoint files degrade to "unknown".
                    proj["status"] = "unknown"
            projects.append(proj)
    return {"projects": projects}
================================================
FILE: researchclaw/server/routes/voice.py
================================================
"""Voice upload / transcription API routes."""
from __future__ import annotations
import logging
from typing import Any
from fastapi import APIRouter, HTTPException, UploadFile, File
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/voice", tags=["voice"])
@router.post("/transcribe")
async def transcribe_audio(
    file: UploadFile = File(...),
    language: str = "zh",
) -> dict[str, Any]:
    """Transcribe uploaded audio using Whisper API.

    Args:
        file: Uploaded audio payload (read fully into memory below).
        language: Language hint passed through to the transcriber;
            defaults to Chinese ("zh").

    Raises:
        HTTPException: 501 when the optional voice extra is not installed;
            403 when voice is disabled in the server configuration.
    """
    try:
        # The voice stack is an optional extra; import lazily so the route
        # can report a clean 501 instead of failing at import time.
        from researchclaw.voice.transcriber import VoiceTranscriber
    except ImportError:
        raise HTTPException(
            status_code=501,
            detail="Voice dependencies not installed. Run: pip install researchclaw[voice]",
        )
    from researchclaw.server.app import _app_state
    config = _app_state.get("config")
    if not config or not config.server.voice_enabled:
        raise HTTPException(status_code=403, detail="Voice is not enabled in config")
    # The whole upload is buffered in memory before transcription.
    audio_bytes = await file.read()
    transcriber = VoiceTranscriber(config.server)
    text = await transcriber.transcribe(audio_bytes, language=language)
    return {"text": text, "language": language}
================================================
FILE: researchclaw/server/websocket/__init__.py
================================================
"""WebSocket modules."""
================================================
FILE: researchclaw/server/websocket/events.py
================================================
"""WebSocket event type definitions."""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Any
import json
import time
class EventType(str, Enum):
    """All WebSocket event types.

    str-valued so the names serialize directly into JSON frames.
    """

    # Lifecycle
    CONNECTED = "connected"    # sent once when a client registers
    HEARTBEAT = "heartbeat"    # periodic keep-alive ping
    ERROR = "error"            # handler/processing failure
    # Pipeline
    PIPELINE_STARTED = "pipeline_started"
    PIPELINE_COMPLETED = "pipeline_completed"
    STAGE_START = "stage_start"
    STAGE_COMPLETE = "stage_complete"
    STAGE_FAIL = "stage_fail"
    METRIC_UPDATE = "metric_update"
    LOG_LINE = "log_line"
    PAPER_READY = "paper_ready"
    # Chat
    CHAT_RESPONSE = "chat_response"
    CHAT_TYPING = "chat_typing"
    CHAT_SUGGESTION = "chat_suggestion"
    # System
    RUN_DISCOVERED = "run_discovered"
    RUN_STATUS_CHANGED = "run_status_changed"
@dataclass
class Event:
    """A WebSocket event."""

    type: EventType
    data: dict[str, Any] = field(default_factory=dict)
    timestamp: float = field(default_factory=time.time)  # creation time, unix seconds

    def to_json(self) -> str:
        """Serialize to a JSON string frame."""
        payload = {
            "type": self.type.value,
            "data": self.data,
            "timestamp": self.timestamp,
        }
        return json.dumps(payload)

    @classmethod
    def from_json(cls, raw: str) -> Event:
        """Deserialize from a JSON string frame."""
        parsed = json.loads(raw)
        return cls(
            type=EventType(parsed["type"]),
            data=parsed.get("data", {}),
            timestamp=parsed.get("timestamp", time.time()),
        )
================================================
FILE: researchclaw/server/websocket/manager.py
================================================
"""WebSocket connection manager."""
from __future__ import annotations
import asyncio
import logging
import time
from typing import Any
from fastapi import WebSocket
from .events import Event, EventType
logger = logging.getLogger(__name__)
class ConnectionManager:
    """Manage WebSocket connections and broadcast events."""

    def __init__(self) -> None:
        # client_id -> live WebSocket
        self._connections: dict[str, WebSocket] = {}
        # Events published from sync code; drained inside heartbeat_loop().
        self._event_queue: asyncio.Queue[Event] = asyncio.Queue()

    @property
    def active_count(self) -> int:
        """Number of currently registered clients."""
        return len(self._connections)

    async def connect(self, websocket: WebSocket, client_id: str) -> None:
        """Accept and register a connection, then send a CONNECTED greeting."""
        await websocket.accept()
        self._connections[client_id] = websocket
        logger.info("WebSocket connected: %s (total: %d)", client_id, self.active_count)
        greeting = Event(type=EventType.CONNECTED, data={"client_id": client_id})
        await self._send(websocket, greeting)

    def disconnect(self, client_id: str) -> None:
        """Remove a disconnected client (no-op if unknown)."""
        self._connections.pop(client_id, None)
        logger.info("WebSocket disconnected: %s (total: %d)", client_id, self.active_count)

    async def broadcast(self, event: Event) -> None:
        """Send event to all connected clients, pruning any that fail."""
        failed: list[str] = []
        for cid, ws in self._connections.items():
            try:
                await self._send(ws, event)
            except Exception:
                failed.append(cid)
        for cid in failed:
            self.disconnect(cid)

    async def send_to(self, client_id: str, event: Event) -> None:
        """Send event to one client; drop the client when the send fails."""
        ws = self._connections.get(client_id)
        if ws:
            try:
                await self._send(ws, event)
            except Exception:
                self.disconnect(client_id)

    async def _send(self, ws: WebSocket, event: Event) -> None:
        # Single choke point so every path emits identically serialized JSON.
        await ws.send_text(event.to_json())

    def publish(self, event: Event) -> None:
        """Non-async publish for use from sync code (thread-safe queue)."""
        try:
            self._event_queue.put_nowait(event)
        except asyncio.QueueFull:
            logger.warning("Event queue full, dropping event: %s", event.type)

    async def drain_queue(self) -> None:
        """Broadcast every event currently waiting in the queue."""
        while not self._event_queue.empty():
            await self.broadcast(self._event_queue.get_nowait())

    async def heartbeat_loop(self, interval: float = 15.0) -> None:
        """Periodically ping all clients and flush the publish queue."""
        while True:
            await asyncio.sleep(interval)
            beat = Event(
                type=EventType.HEARTBEAT,
                data={"active_clients": self.active_count},
            )
            await self.broadcast(beat)
            await self.drain_queue()
================================================
FILE: researchclaw/servers/__init__.py
================================================
"""Multi-server resource scheduling for AutoResearchClaw."""
from researchclaw.servers.registry import ServerRegistry
from researchclaw.servers.monitor import ServerMonitor
from researchclaw.servers.dispatcher import TaskDispatcher
__all__ = ["ServerRegistry", "ServerMonitor", "TaskDispatcher"]
================================================
FILE: researchclaw/servers/cloud_executor.py
================================================
"""Cloud executor: stub for AWS/GCP/Azure GPU instance management."""
from __future__ import annotations
import logging
from typing import Any
from researchclaw.servers.registry import ServerEntry
logger = logging.getLogger(__name__)


class CloudExecutor:
    """Manage cloud GPU instances for experiment execution.

    This is a stub implementation. Actual cloud provider APIs (boto3,
    google-cloud, azure-mgmt) are imported lazily to avoid hard dependencies.
    """

    def __init__(self, server: ServerEntry) -> None:
        """Wrap a cloud-type server entry.

        Raises:
            ValueError: if *server* is not of type ``cloud``.
        """
        if server.server_type != "cloud":
            raise ValueError(f"Server {server.name} is not a cloud server")
        self.server = server
        self.provider = server.cloud_provider

    async def launch_instance(self) -> dict[str, Any]:
        """Launch a cloud GPU instance (stubbed: no SDK call is made)."""
        entry = self.server
        logger.info(
            "Launching %s instance (%s) for %s",
            self.provider,
            entry.cloud_instance_type,
            entry.name,
        )
        # Stub: actual implementation would call provider SDK
        details = {
            "provider": self.provider,
            "instance_type": entry.cloud_instance_type,
            "status": "stub_launched",
            "instance_id": f"stub-{entry.name}",
            "cost_per_hour": entry.cost_per_hour,
        }
        return details

    async def terminate_instance(self, instance_id: str) -> None:
        """Terminate a cloud instance (stubbed: logs only)."""
        logger.info("Terminating instance %s on %s", instance_id, self.provider)

    async def get_instance_status(self, instance_id: str) -> dict[str, Any]:
        """Check instance status (stubbed: always unknown)."""
        return {"instance_id": instance_id, "status": "stub_unknown"}
================================================
FILE: researchclaw/servers/dispatcher.py
================================================
"""Task dispatcher: route experiment tasks to the best available server."""
from __future__ import annotations
import asyncio
import logging
import uuid
from typing import Any
from researchclaw.servers.registry import ServerEntry, ServerRegistry
from researchclaw.servers.monitor import ServerMonitor
from researchclaw.servers.ssh_executor import SSHExecutor
from researchclaw.servers.slurm_executor import SlurmExecutor
logger = logging.getLogger(__name__)


class TaskDispatcher:
    """Dispatch experiment tasks to the best available server."""

    def __init__(
        self,
        registry: ServerRegistry,
        monitor: ServerMonitor,
        prefer_free: bool = True,
        failover: bool = True,
    ) -> None:
        """Create a dispatcher.

        Args:
            registry: source of candidate servers.
            monitor: server health monitor (held for scheduling use).
            prefer_free: prefer servers with ``cost_per_hour == 0``.
            failover: retry once on a different server when execution fails.
        """
        self.registry = registry
        self.monitor = monitor
        self.prefer_free = prefer_free
        self.failover = failover
        # task_id -> {"status", "server", "task", "error"/"result"/"job_id"}
        self._tasks: dict[str, dict[str, Any]] = {}
        # Names of servers currently assigned a task.
        self._busy_servers: set[str] = set()

    async def dispatch(self, task: dict[str, Any]) -> str:
        """Dispatch a task to the best available server.

        Args:
            task: dict with keys: command, local_dir, requirements (optional)

        Returns:
            task_id for tracking
        """
        task_id = uuid.uuid4().hex[:12]
        requirements = task.get("requirements", {})
        # Find best server
        server = self.registry.get_best_match(
            requirements=requirements,
            prefer_free=self.prefer_free,
        )
        if server is None:
            # No capacity: record as queued so callers can retry later.
            self._tasks[task_id] = {"status": "queued", "task": task, "error": "No matching server"}
            logger.warning("No server available for task %s, queued", task_id)
            return task_id
        self._tasks[task_id] = {
            "status": "dispatched",
            "server": server.name,
            "task": task,
        }
        self._busy_servers.add(server.name)
        logger.info("Dispatched task %s to %s (%s)", task_id, server.name, server.server_type)
        return task_id

    async def _run_on(
        self,
        server: ServerEntry,
        task: dict[str, Any],
        task_id: str,
        info: dict[str, Any],
    ) -> dict[str, Any]:
        """Execute *task* on *server* with the type-appropriate executor.

        Slurm servers get an async job submission (status becomes "running");
        everything else runs synchronously over SSH.  Updates *info* in place.
        """
        remote_dir = f"/tmp/researchclaw_{task_id}"
        if server.server_type == "slurm":
            executor = SlurmExecutor(server)
            job_id = await executor.submit_job(
                command=task["command"],
                remote_dir=remote_dir,
                resources=task.get("requirements"),
            )
            info["status"] = "running"
            info["job_id"] = job_id
            return {"success": True, "job_id": job_id}
        # Default: SSH executor.
        result = await SSHExecutor(server).run_experiment(
            remote_dir=remote_dir,
            command=task["command"],
            timeout=task.get("timeout", 3600),
        )
        info["status"] = "completed" if result["success"] else "failed"
        info["result"] = result
        return result

    async def execute_task(self, task_id: str) -> dict[str, Any]:
        """Execute a dispatched task on its assigned server.

        On failure, optionally fails over once (non-recursively) to the next
        best server before reporting the original error.
        """
        info = self._tasks.get(task_id)
        if not info or info["status"] != "dispatched":
            return {"success": False, "error": "Task not dispatched"}
        server = self.registry.get(info["server"])
        task = info["task"]
        try:
            return await self._run_on(server, task, task_id, info)
        except Exception as exc:
            logger.error("Task %s failed: %s", task_id, exc)
            info["status"] = "failed"
            info["error"] = str(exc)
            if self.failover:
                alt = self.registry.get_best_match(
                    requirements=task.get("requirements"),
                    prefer_free=self.prefer_free,
                )
                # Single failover attempt, never back to the failed server.
                if alt and alt.name != server.name:
                    logger.info("Failing over task %s to %s", task_id, alt.name)
                    info["server"] = alt.name
                    info["status"] = "dispatched"
                    try:
                        # BUG FIX: the original called run_experiment() on the
                        # ServerEntry itself (no such method) with a different
                        # remote_dir; route through the shared executor helper.
                        return await self._run_on(alt, task, task_id, info)
                    except Exception as alt_exc:
                        logger.error("Failover also failed: %s", alt_exc)
                        info["status"] = "failed"
                        info["error"] = str(alt_exc)
            return {"success": False, "error": str(exc)}
        finally:
            self._busy_servers.discard(server.name)

    def get_task_status(self, task_id: str) -> dict[str, Any]:
        """Get the status of a task (``unknown`` for an unrecognized id)."""
        info = self._tasks.get(task_id)
        if not info:
            return {"task_id": task_id, "status": "unknown"}
        return {
            "task_id": task_id,
            "status": info["status"],
            "server": info.get("server"),
            "error": info.get("error"),
        }
================================================
FILE: researchclaw/servers/monitor.py
================================================
"""Server monitor: check health and resource usage of registered servers."""
from __future__ import annotations
import asyncio
import logging
from typing import Any
from researchclaw.servers.registry import ServerEntry, ServerRegistry
logger = logging.getLogger(__name__)


class ServerMonitor:
    """Monitor health and resource usage of registered servers."""

    def __init__(self, registry: ServerRegistry) -> None:
        self.registry = registry
        # Most recently observed status dict per server name.
        self._status_cache: dict[str, dict[str, Any]] = {}

    async def check_status(self, server: ServerEntry) -> dict[str, Any]:
        """Check a single server's status via SSH (nvidia-smi, free, uptime)."""
        probe = (
            "nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total "
            "--format=csv,noheader,nounits 2>/dev/null; echo '---'; "
            "free -m | head -2; echo '---'; uptime"
        )
        try:
            raw = await _ssh_command(server.host, probe)
            status = _parse_status_output(raw, server)
            status["reachable"] = True
        except Exception as exc:
            logger.warning("Cannot reach server %s: %s", server.name, exc)
            status = {"reachable": False, "error": str(exc)}
        self._status_cache[server.name] = status
        return status

    async def check_all(self) -> dict[str, dict[str, Any]]:
        """Check all servers concurrently."""
        servers = self.registry.list_all()
        results = await asyncio.gather(
            *(self.check_status(s) for s in servers),
            return_exceptions=True,
        )
        statuses: dict[str, dict[str, Any]] = {}
        for srv, res in zip(servers, results):
            if isinstance(res, Exception):
                statuses[srv.name] = {"reachable": False, "error": str(res)}
            else:
                statuses[srv.name] = res
        return statuses

    def get_cached(self, name: str) -> dict[str, Any] | None:
        """Return cached status for a server, or None if never checked."""
        return self._status_cache.get(name)

    def get_gpu_usage(self, server: ServerEntry) -> dict[str, Any]:
        """Return cached GPU usage for a server (sync convenience)."""
        return self._status_cache.get(server.name, {}).get("gpu", {})
async def _ssh_command(host: str, command: str) -> str:
    """Run *command* on *host* over SSH and return its stdout as text.

    Raises:
        RuntimeError: if ssh exits with a nonzero return code.
    """
    argv = [
        "ssh",
        "-o", "ConnectTimeout=5",
        "-o", "StrictHostKeyChecking=no",
        host,
        command,
    ]
    proc = await asyncio.create_subprocess_exec(
        *argv,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    out, err = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"SSH command failed (rc={proc.returncode}): {err.decode().strip()}")
    return out.decode()
def _parse_status_output(raw: str, server: ServerEntry) -> dict[str, Any]:
"""Parse combined nvidia-smi + free + uptime output."""
sections = raw.split("---")
status: dict[str, Any] = {"server": server.name, "host": server.host}
# GPU section
if len(sections) >= 1:
gpu_lines = [l.strip() for l in sections[0].strip().splitlines() if l.strip()]
gpus = []
for line in gpu_lines:
parts = [p.strip() for p in line.split(",")]
if len(parts) >= 3:
gpus.append({
"utilization_pct": int(parts[0]),
"memory_used_mb": int(parts[1]),
"memory_total_mb": int(parts[2]),
})
status["gpu"] = {"count": len(gpus), "devices": gpus}
# Memory section
if len(sections) >= 2:
mem_lines = sections[1].strip().splitlines()
if len(mem_lines) >= 2:
parts = mem_lines[1].split()
if len(parts) >= 4:
status["memory"] = {
"total_mb": int(parts[1]),
"used_mb": int(parts[2]),
"free_mb": int(parts[3]),
}
# Uptime section
if len(sections) >= 3:
status["uptime"] = sections[2].strip()
return status
================================================
FILE: researchclaw/servers/registry.py
================================================
"""Server registry: manage available compute servers."""
from __future__ import annotations
import logging
from typing import Any
logger = logging.getLogger(__name__)
class ServerEntry:
    """A compute server that can run experiments."""

    # Field names mirrored by to_dict()/from_dict().
    _FIELDS = (
        "name", "host", "server_type", "gpu", "vram_gb", "priority",
        "scheduler", "cloud_provider", "cloud_instance_type", "cost_per_hour",
    )

    def __init__(
        self,
        name: str,
        host: str,
        server_type: str = "ssh",
        gpu: str = "",
        vram_gb: int = 0,
        priority: int = 1,
        scheduler: str = "",
        cloud_provider: str = "",
        cloud_instance_type: str = "",
        cost_per_hour: float = 0.0,
    ) -> None:
        self.name = name
        self.host = host
        self.server_type = server_type  # ssh | slurm | cloud
        self.gpu = gpu
        self.vram_gb = vram_gb
        self.priority = priority
        self.scheduler = scheduler  # slurm | pbs | lsf
        self.cloud_provider = cloud_provider  # aws | gcp | azure
        self.cloud_instance_type = cloud_instance_type
        self.cost_per_hour = cost_per_hour

    def to_dict(self) -> dict[str, Any]:
        """Serialize all fields into a plain dict."""
        return {field: getattr(self, field) for field in self._FIELDS}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ServerEntry:
        """Build an entry from a dict; only ``name`` is required."""
        options = {
            "host": data.get("host", ""),
            "server_type": data.get("server_type", "ssh"),
            "gpu": data.get("gpu", ""),
            "vram_gb": int(data.get("vram_gb", 0)),
            "priority": int(data.get("priority", 1)),
            "scheduler": data.get("scheduler", ""),
            "cloud_provider": data.get("cloud_provider", ""),
            "cloud_instance_type": data.get("cloud_instance_type", ""),
            "cost_per_hour": float(data.get("cost_per_hour", 0.0)),
        }
        return cls(data["name"], **options)
class ServerRegistry:
    """Registry of available compute servers."""

    def __init__(self, servers: list[ServerEntry] | None = None) -> None:
        # name -> ServerEntry; later duplicates replace earlier ones.
        self._servers: dict[str, ServerEntry] = {s.name: s for s in (servers or [])}

    def add(self, server: ServerEntry) -> None:
        """Register a new server (replaces any entry with the same name)."""
        self._servers[server.name] = server
        logger.info("Registered server: %s (%s)", server.name, server.host)

    def remove(self, name: str) -> None:
        """Remove a server from the registry.

        Raises:
            KeyError: if no server with that name is registered.
        """
        if name not in self._servers:
            raise KeyError(f"Unknown server: {name}")
        del self._servers[name]

    def get(self, name: str) -> ServerEntry:
        """Look up a server by name.

        Raises:
            KeyError: if no server with that name is registered.
        """
        entry = self._servers.get(name)
        if entry is None:
            raise KeyError(f"Unknown server: {name}")
        return entry

    def list_all(self) -> list[ServerEntry]:
        """Return all registered servers sorted by priority (lower = higher priority)."""
        return sorted(self._servers.values(), key=lambda entry: entry.priority)

    def get_available(self, exclude: set[str] | None = None) -> list[ServerEntry]:
        """Return servers not in the exclude set, sorted by priority."""
        skip = exclude if exclude is not None else set()
        return [entry for entry in self.list_all() if entry.name not in skip]

    def get_best_match(
        self,
        requirements: dict[str, Any] | None = None,
        prefer_free: bool = True,
    ) -> ServerEntry | None:
        """Find the best server matching resource requirements.

        Args:
            requirements: dict with optional keys: min_vram_gb, server_type, gpu
            prefer_free: prefer servers with cost_per_hour == 0
        """
        reqs = requirements or {}
        # Build one predicate per requested constraint (defaults bind the
        # current values so the lambdas are self-contained).
        checks: list = []
        min_vram = reqs.get("min_vram_gb", 0)
        if min_vram:
            checks.append(lambda c, v=min_vram: c.vram_gb >= v)
        wanted_type = reqs.get("server_type")
        if wanted_type:
            checks.append(lambda c, t=wanted_type: c.server_type == t)
        wanted_gpu = reqs.get("gpu")
        if wanted_gpu:
            checks.append(lambda c, g=wanted_gpu.lower(): g in c.gpu.lower())
        candidates = [c for c in self.list_all() if all(chk(c) for chk in checks)]
        if not candidates:
            return None
        # Free servers first (when requested), then ascending priority.
        if prefer_free:
            candidates.sort(key=lambda c: (c.cost_per_hour > 0, c.priority))
        return candidates[0]

    @property
    def count(self) -> int:
        """Number of registered servers."""
        return len(self._servers)
================================================
FILE: researchclaw/servers/slurm_executor.py
================================================
"""Slurm HPC executor: submit, monitor, and cancel batch jobs."""
from __future__ import annotations
import asyncio
import logging
import textwrap
from typing import Any
from researchclaw.servers.registry import ServerEntry
logger = logging.getLogger(__name__)


class SlurmExecutor:
    """Submit and manage Slurm batch jobs via SSH."""

    def __init__(self, server: ServerEntry) -> None:
        """Wrap a slurm-type server entry.

        Raises:
            ValueError: if *server* is not of type ``slurm``.
        """
        if server.server_type != "slurm":
            raise ValueError(f"Server {server.name} is not a slurm server")
        self.server = server
        self.host = server.host

    def _generate_sbatch_script(
        self,
        command: str,
        job_name: str = "researchclaw",
        resources: dict[str, Any] | None = None,
    ) -> str:
        """Generate an sbatch submission script.

        Args:
            command: shell command to run as the job body.
            job_name: Slurm job name.
            resources: optional keys gpus (default 1), mem_gb (16),
                time ("01:00:00"), partition ("" = cluster default).
        """
        res = resources or {}
        gpus = res.get("gpus", 1)
        mem = res.get("mem_gb", 16)
        time_limit = res.get("time", "01:00:00")
        partition = res.get("partition", "")
        lines = [
            "#!/bin/bash",
            f"#SBATCH --job-name={job_name}",
            f"#SBATCH --gres=gpu:{gpus}",
            f"#SBATCH --mem={mem}G",
            f"#SBATCH --time={time_limit}",
            "#SBATCH --output=slurm-%j.out",
            "#SBATCH --error=slurm-%j.err",
        ]
        if partition:
            lines.append(f"#SBATCH --partition={partition}")
        lines.append("")
        lines.append(command)
        return "\n".join(lines)

    async def submit_job(
        self,
        command: str,
        remote_dir: str,
        job_name: str = "researchclaw",
        resources: dict[str, Any] | None = None,
    ) -> str:
        """Submit a Slurm job and return the job ID.

        Raises:
            RuntimeError: if sbatch fails or its output cannot be parsed.
        """
        script = self._generate_sbatch_script(command, job_name, resources)
        # Write script and submit via SSH
        import shlex as _shlex
        # BUG FIX: the original put "&& sbatch _job.sh" on its own line after
        # the heredoc terminator, which is a shell syntax error (a command
        # cannot start with &&), so no job could ever be submitted.  Also
        # create the remote directory, since callers do not.
        ssh_cmd = (
            f"mkdir -p {_shlex.quote(remote_dir)} && cd {_shlex.quote(remote_dir)} && "
            f"cat <<'EOFSCRIPT' > _job.sh\n{script}\nEOFSCRIPT\n"
            "sbatch _job.sh"
        )
        proc = await asyncio.create_subprocess_exec(
            "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no",
            self.host, ssh_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"sbatch failed: {stderr.decode().strip()}")
        # Parse "Submitted batch job 12345"
        output = stdout.decode().strip()
        parts = output.split()
        if len(parts) >= 4 and parts[-1].isdigit():
            job_id = parts[-1]
            logger.info("Submitted Slurm job %s on %s", job_id, self.server.name)
            return job_id
        raise RuntimeError(f"Could not parse sbatch output: {output}")

    async def check_job(self, job_id: str) -> dict[str, Any]:
        """Check job status via squeue (running jobs) or sacct (finished)."""
        proc = await asyncio.create_subprocess_exec(
            "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no",
            self.host,
            f"squeue -j {job_id} -h -o '%T' 2>/dev/null || sacct -j {job_id} -n -o State -P 2>/dev/null",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
        # First line of output is the state; treat empty output as UNKNOWN.
        state = stdout.decode().strip().split("\n")[0].strip() if stdout else "UNKNOWN"
        return {"job_id": job_id, "state": state or "UNKNOWN"}

    async def cancel_job(self, job_id: str) -> None:
        """Cancel a running job (best-effort; scancel errors are ignored)."""
        proc = await asyncio.create_subprocess_exec(
            "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no",
            self.host, f"scancel {job_id}",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        await proc.communicate()
        logger.info("Cancelled Slurm job %s on %s", job_id, self.server.name)
================================================
FILE: researchclaw/servers/ssh_executor.py
================================================
"""SSH remote executor: upload code, run experiments, download results."""
from __future__ import annotations
import asyncio
import logging
import shlex
from pathlib import Path
from typing import Any
from researchclaw.servers.registry import ServerEntry
logger = logging.getLogger(__name__)


class SSHExecutor:
    """Execute experiments on remote servers via SSH/rsync."""

    # Flags shared by every ssh invocation (fail fast, no host-key prompt).
    _SSH_FLAGS = ("-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no")
    _RSYNC_SSH = "ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no"

    def __init__(self, server: ServerEntry) -> None:
        self.server = server
        self.host = server.host

    async def _rsync(self, src: str, dst: str, delete: bool, action: str) -> None:
        """Run rsync between *src* and *dst*; raise RuntimeError on failure."""
        argv = ["rsync", "-az"]
        if delete:
            argv.append("--delete")
        argv += ["-e", self._RSYNC_SSH, src, dst]
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"rsync {action} failed: {stderr.decode().strip()}")

    async def upload_code(self, local_dir: Path, remote_dir: str) -> None:
        """Upload experiment code via rsync (mirrors local_dir, deleting extras)."""
        src = str(local_dir.resolve()) + "/"
        dst = f"{self.host}:{remote_dir}/"
        logger.info("Uploading %s -> %s", src, dst)
        await self._rsync(src, dst, delete=True, action="upload")

    async def run_experiment(
        self,
        remote_dir: str,
        command: str,
        timeout: int = 3600,
    ) -> dict[str, Any]:
        """Run an experiment command on the remote server.

        Returns a dict with success, stdout, stderr, returncode; on timeout
        the process is killed and a failure dict is returned instead.
        """
        full_cmd = f"cd {shlex.quote(remote_dir)} && {command}"
        logger.info("Running on %s: %s", self.host, full_cmd)
        proc = await asyncio.create_subprocess_exec(
            "ssh", *self._SSH_FLAGS, self.host, full_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        except asyncio.TimeoutError:
            proc.kill()
            await proc.wait()
            return {"success": False, "error": f"Timeout after {timeout}s", "returncode": -1}
        return {
            "success": proc.returncode == 0,
            "stdout": stdout.decode(),
            "stderr": stderr.decode(),
            "returncode": proc.returncode,
        }

    async def download_results(self, remote_dir: str, local_dir: Path) -> None:
        """Download experiment results via rsync into local_dir (created if needed)."""
        local_dir.mkdir(parents=True, exist_ok=True)
        src = f"{self.host}:{remote_dir}/"
        dst = str(local_dir.resolve()) + "/"
        logger.info("Downloading %s -> %s", src, dst)
        await self._rsync(src, dst, delete=False, action="download")

    async def cleanup(self, remote_dir: str) -> None:
        """Remove the remote experiment directory (best-effort)."""
        logger.info("Cleaning up %s:%s", self.host, remote_dir)
        proc = await asyncio.create_subprocess_exec(
            "ssh", *self._SSH_FLAGS,
            self.host, f"rm -rf {shlex.quote(remote_dir)}",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        await proc.communicate()
================================================
FILE: researchclaw/skills/__init__.py
================================================
"""Dynamic skills library for AutoResearchClaw.
Provides a registry of reusable research/engineering/writing skills
that can be automatically matched to pipeline stages and injected
into LLM prompts.
"""
from researchclaw.skills.schema import Skill
from researchclaw.skills.registry import SkillRegistry
__all__ = ["Skill", "SkillRegistry"]
================================================
FILE: researchclaw/skills/builtin/__init__.py
================================================
================================================
FILE: researchclaw/skills/builtin/domain/cv-classification/SKILL.md
================================================
---
name: cv-classification
description: Best practices for image classification tasks. Use when working on CIFAR, ImageNet, or other classification benchmarks.
metadata:
category: domain
trigger-keywords: "classification,image,cifar,imagenet,resnet,vision,cnn,vit"
applicable-stages: "9,10"
priority: "3"
version: "1.0"
author: researchclaw
references: "He et al., Deep Residual Learning, CVPR 2016; Dosovitskiy et al., An Image is Worth 16x16 Words, ICLR 2021"
---
## Image Classification Best Practice
Architecture selection:
- Small scale (CIFAR-10/100): ResNet-18/34, WideResNet, Simple ViT
- Medium scale: ResNet-50, EfficientNet-B0/B1, DeiT-Small
- Large scale: ViT-B/16, ConvNeXt, Swin Transformer
Training recipe:
- Optimizer: AdamW (lr=1e-3 to 3e-4) or SGD (lr=0.1 with cosine decay)
- Weight decay: 0.01-0.1 for AdamW, 5e-4 for SGD
- Data augmentation: RandomCrop, RandomHorizontalFlip, Cutout/CutMix
- Warmup: 5-10 epochs linear warmup for transformers
- Batch size: 128-256 for CNNs, 512-1024 for ViTs (if memory allows)
Standard benchmarks:
- CIFAR-10: ~96% (ResNet-18), ~97% (WideResNet)
- CIFAR-100: ~80% (ResNet-18), ~84% (WideResNet)
- ImageNet: ~76% (ResNet-50), ~81% (ViT-B/16)
================================================
FILE: researchclaw/skills/builtin/domain/cv-detection/SKILL.md
================================================
---
name: cv-detection
description: Best practices for object detection tasks. Use when working on COCO, VOC, or detection architectures like YOLO and DETR.
metadata:
category: domain
trigger-keywords: "detection,object,bbox,yolo,coco,anchor,faster rcnn"
applicable-stages: "9,10"
priority: "5"
version: "1.0"
author: researchclaw
references: "Ren et al., Faster R-CNN, NeurIPS 2015; Carion et al., End-to-End Object Detection with Transformers, ECCV 2020"
---
## Object Detection Best Practice
Architecture families:
- One-stage: YOLO (v5/v8), SSD, RetinaNet, FCOS
- Two-stage: Faster R-CNN, Cascade R-CNN
- Transformer: DETR, DINO, RT-DETR
Training recipe:
- Use pre-trained backbone (ImageNet)
- Multi-scale training and testing
- IoU threshold: 0.5 for mAP50, 0.5:0.95 for mAP
- Use FPN for multi-scale feature extraction
- Focal loss for class imbalance in one-stage detectors
Standard benchmarks:
- COCO val2017: ~37 mAP (Faster R-CNN R50), ~51 mAP (DINO Swin-L)
- Pascal VOC: ~80 mAP50 (Faster R-CNN)
================================================
FILE: researchclaw/skills/builtin/domain/nlp-alignment/SKILL.md
================================================
---
name: nlp-alignment
description: Best practices for LLM alignment techniques including RLHF, DPO, and instruction tuning. Use when working on alignment or safety.
metadata:
category: domain
trigger-keywords: "alignment,rlhf,dpo,reward model,preference,instruction tuning,safety"
applicable-stages: "9,10"
priority: "4"
version: "1.0"
author: researchclaw
references: "Ouyang et al., Training language models to follow instructions, NeurIPS 2022; Rafailov et al., DPO, NeurIPS 2023"
---
## LLM Alignment Best Practice
Methods:
- RLHF: Train reward model → PPO fine-tuning (complex but powerful)
- DPO: Direct preference optimization (simpler, no reward model needed)
- GRPO: Group relative policy optimization
- SFT: Supervised fine-tuning as alignment baseline
Training recipe:
- Start with SFT on high-quality instruction data
- DPO: lr=5e-7, beta=0.1, batch_size=64
- PPO: lr=1e-6, clip=0.2, KL coeff=0.02
- Use reference model for KL penalty
- Evaluate on safety benchmarks (TruthfulQA, BBQ, etc.)
Common pitfalls:
- Reward hacking: model finds shortcuts to high reward
- Mode collapse: model generates repetitive outputs
- Catastrophic forgetting: loses general capabilities
================================================
FILE: researchclaw/skills/builtin/domain/nlp-pretraining/SKILL.md
================================================
---
name: nlp-pretraining
description: Best practices for language model pretraining and fine-tuning. Use when generating or reviewing NLP training code.
metadata:
category: domain
trigger-keywords: "language model,pretraining,fine-tuning,bert,gpt,llm,transformer,nlp,text"
applicable-stages: "9,10"
priority: "3"
version: "1.0"
author: researchclaw
references: "Devlin et al., BERT, NAACL 2019; Hu et al., LoRA, ICLR 2022"
---
## NLP Pretraining/Fine-tuning Best Practice
Fine-tuning recipe:
- Use pre-trained checkpoints (HuggingFace hub)
- AdamW optimizer, lr=2e-5 to 5e-5
- Linear warmup (6% of total steps) + linear decay
- Batch size: 16-32 (use gradient accumulation for larger effective batch)
- 3-5 epochs for classification, 1-2 for generation
- Weight decay: 0.01
Parameter-efficient methods:
- LoRA: r=8-64, alpha=16-128, apply to q/v projections
- Prefix tuning: 10-20 prefix tokens
- Adapters: bottleneck dimension 64-256
Evaluation:
- Classification: accuracy, F1 (macro for imbalanced)
- Generation: perplexity, BLEU/ROUGE, human evaluation
- Use multiple seeds and report mean +/- std
================================================
FILE: researchclaw/skills/builtin/domain/rl-policy-optimization/SKILL.md
================================================
---
name: rl-policy-optimization
description: Best practices for reinforcement learning policy optimization. Use when working on RL agents, PPO, SAC, or reward design.
metadata:
category: domain
trigger-keywords: "reinforcement learning,rl,policy,reward,agent,environment,ppo,sac"
applicable-stages: "9,10"
priority: "3"
version: "1.0"
author: researchclaw
references: "Schulman et al., Proximal Policy Optimization, 2017; Haarnoja et al., Soft Actor-Critic, ICML 2018"
---
## RL Policy Optimization Best Practice
Algorithm selection:
- Discrete actions: PPO, DQN, A2C
- Continuous actions: SAC, TD3, PPO
- Multi-agent: MAPPO, QMIX
- Offline: CQL, IQL, Decision Transformer
Training recipe:
- PPO: clip=0.2, lr=3e-4, gamma=0.99, GAE lambda=0.95
- SAC: lr=3e-4, tau=0.005, auto-tune alpha
- Use vectorized environments (e.g., gymnasium.vector)
- Normalize observations and rewards
- Log episode return, episode length, value loss, policy entropy
Evaluation:
- Report mean +/- std over 10+ evaluation episodes
- Use deterministic policy for evaluation
- Compare against random policy and simple baselines
- Report sample efficiency (return vs. env steps)
Common pitfalls:
- Reward shaping can introduce bias
- Seed sensitivity is HIGH — use 5+ seeds
- Hyperparameter sensitivity — do a small sweep
================================================
FILE: researchclaw/skills/builtin/experiment/experimental-design/SKILL.md
================================================
---
name: experimental-design
description: Best practices for designing reproducible ML experiments. Use when planning ablations, baselines, or controlled experiments.
metadata:
category: experiment
trigger-keywords: "experiment,ablation,baseline,control,hypothesis,reproducib"
applicable-stages: "9,10,12"
priority: "2"
version: "1.0"
author: researchclaw
references: "Bouthillier et al., Accounting for Variance in ML Benchmarks, MLSys 2021"
---
## Experimental Design Best Practice
1. ALWAYS include meaningful baselines (not just random):
- At least one classical method baseline
- At least one recent SOTA method baseline
- A simple-but-strong baseline (e.g., linear probe, k-NN)
2. Use MULTIPLE random seeds (minimum 3, ideally 5)
3. Report mean +/- std across seeds
4. Design ablations that isolate EACH key component:
- Remove one component at a time
- Each ablation must be meaningfully different from baseline
5. Control variables: change only ONE thing per comparison
6. Use standard splits (train/val/test) — never test on training data
7. Report wall-clock time and memory usage alongside accuracy
================================================
FILE: researchclaw/skills/builtin/experiment/meta-analysis/SKILL.md
================================================
---
name: meta-analysis
description: Statistical methods for combining results across multiple studies. Use when aggregating cross-study or cross-experiment results.
metadata:
category: experiment
trigger-keywords: "meta-analysis,effect size,pooled,cross-study,aggregat"
applicable-stages: "7,14"
priority: "5"
version: "1.0"
author: researchclaw
references: "Borenstein et al., Introduction to Meta-Analysis, 2009"
---
## Meta-Analysis Best Practice
When comparing results across studies or experiments:
1. Report effect sizes, not just p-values
2. Use standardized metrics for cross-study comparison
3. Account for heterogeneity (different setups, datasets, seeds)
4. Report confidence intervals alongside point estimates
5. Use forest plots to visualize cross-study comparisons
6. Identify and discuss outliers or inconsistent results
7. Consider publication bias when interpreting aggregate results
================================================
FILE: researchclaw/skills/builtin/experiment/systematic-review/SKILL.md
================================================
---
name: systematic-review
description: Structured methodology for comprehensive literature review following PRISMA guidelines. Use during literature search and screening stages.
metadata:
category: experiment
trigger-keywords: "literature,review,survey,related work,prior work"
applicable-stages: "3,4,5,6"
priority: "3"
version: "1.0"
author: researchclaw
references: "Page et al., The PRISMA 2020 statement, BMJ 2021"
---
## Systematic Review Best Practice
Follow PRISMA-like methodology for literature search:
1. Define clear inclusion/exclusion criteria BEFORE searching
2. Use multiple databases (Semantic Scholar, arXiv, OpenAlex)
3. Search with both broad and narrow queries
4. Screen by title/abstract first, then full text
5. Extract: method, dataset, metrics, key findings
6. Synthesize gaps and opportunities, not just summaries
7. Prioritize recent (last 2-3 years) high-citation papers
8. Include at least one seminal/foundational paper per sub-topic
================================================
FILE: researchclaw/skills/builtin/tooling/data-loading/SKILL.md
================================================
---
name: data-loading
description: Optimize data loading pipeline to prevent GPU starvation. Use when setting up DataLoader or data preprocessing.
metadata:
category: tooling
trigger-keywords: "data,loading,dataloader,dataset,preprocessing,augmentation"
applicable-stages: "10"
priority: "6"
version: "1.0"
author: researchclaw
references: "PyTorch Data Loading Tutorial, pytorch.org"
---
## Efficient Data Loading Best Practice
1. Use num_workers = min(8, os.cpu_count()) for DataLoader
2. Enable pin_memory=True when using GPU
3. Use persistent_workers=True to avoid re-spawning
4. Pre-compute and cache transformations when possible
5. For image data: use torchvision.transforms.v2 (faster)
6. For large datasets: consider memory-mapped files or WebDataset
7. Profile with torch.utils.bottleneck to find I/O bottlenecks
================================================
FILE: researchclaw/skills/builtin/tooling/distributed-training/SKILL.md
================================================
---
name: distributed-training
description: Multi-GPU and distributed training patterns with PyTorch DDP. Use when scaling training across GPUs.
metadata:
category: tooling
trigger-keywords: "distributed,multi-gpu,parallel,ddp,scale"
applicable-stages: "10,12"
priority: "7"
version: "1.0"
author: researchclaw
references: "PyTorch DDP Tutorial, pytorch.org; Goyal et al., Accurate Large Minibatch SGD, 2017"
---
## Distributed Training Best Practice
1. Use DistributedDataParallel (DDP) over DataParallel for multi-GPU
2. Initialize process group: dist.init_process_group(backend='nccl')
3. Use DistributedSampler for data sharding
4. Synchronize batch norm: nn.SyncBatchNorm.convert_sync_batchnorm()
5. Only save checkpoint on rank 0
6. Scale learning rate linearly with world size
7. Use gradient accumulation for effectively larger batch sizes
================================================
FILE: researchclaw/skills/builtin/tooling/mixed-precision/SKILL.md
================================================
---
name: mixed-precision
description: Use FP16/BF16 mixed precision to accelerate training and reduce memory. Use when optimizing GPU performance.
metadata:
category: tooling
trigger-keywords: "training,gpu,memory,speed,precision,fp16,bf16"
applicable-stages: "10,12"
priority: "5"
version: "1.0"
author: researchclaw
references: "Micikevicius et al., Mixed Precision Training, ICLR 2018"
code-template: |
scaler = torch.cuda.amp.GradScaler()
for batch in dataloader:
optimizer.zero_grad()
with torch.cuda.amp.autocast():
output = model(batch)
loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
---
## Mixed Precision Training Best Practice
Use torch.cuda.amp for automatic mixed precision:
- Wrap forward pass in torch.cuda.amp.autocast()
- Use GradScaler for loss scaling
- BF16 preferred over FP16 on Ampere+ GPUs (RTX 3xxx, A100, RTX 4xxx)
- Watch for NaN gradients — reduce learning rate if needed
- Do NOT use amp with custom CUDA kernels unless tested
================================================
FILE: researchclaw/skills/builtin/tooling/pytorch-training/SKILL.md
================================================
---
name: pytorch-training
description: Best practices for building robust PyTorch training loops. Use when generating or reviewing ML training code.
metadata:
category: tooling
trigger-keywords: "training,pytorch,torch,deep learning,neural network,model"
applicable-stages: "10,12"
priority: "3"
version: "1.0"
author: researchclaw
references: "PyTorch Performance Tuning Guide, pytorch.org"
code-template: |
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Reproducibility
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
# Training loop
model.train()
for epoch in range(num_epochs):
for batch in train_loader:
optimizer.zero_grad(set_to_none=True)
loss = criterion(model(batch['input']), batch['target'])
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
---
## PyTorch Training Best Practice
1. Use torch.manual_seed() for reproducibility (set for torch, numpy, random)
2. Use DataLoader with num_workers>0 and pin_memory=True for GPU
3. Enable cudnn.benchmark=True for fixed input sizes
4. Use learning rate schedulers (CosineAnnealingLR or OneCycleLR)
5. Implement early stopping based on validation metric
6. Log metrics every epoch, save best model checkpoint
7. Use torch.no_grad() for evaluation
8. Clear gradients with optimizer.zero_grad(set_to_none=True) for efficiency
================================================
FILE: researchclaw/skills/loader.py
================================================
"""Skill file loader — supports YAML, JSON, and SKILL.md (agentskills.io)."""
from __future__ import annotations

import json
import logging
import re
from pathlib import Path

import yaml

from researchclaw.skills.schema import Skill
logger = logging.getLogger(__name__)
# ── SKILL.md loader ──────────────────────────────────────────────────
def load_skill_from_skillmd(path: Path) -> Skill | None:
    """Load a skill from a ``SKILL.md`` file (agentskills.io format).

    Expected layout::

        ---
        name: kebab-case-id
        description: one-liner
        metadata:
          category: domain
          trigger-keywords: "kw1,kw2"
        ---
        Markdown body here ...

    Args:
        path: Path to the SKILL.md file.

    Returns:
        Parsed :class:`Skill`, or *None* on failure (unreadable file,
        missing/invalid frontmatter, or missing ``name``).
    """
    try:
        text = path.read_text(encoding="utf-8")
    except Exception as exc:
        logger.warning("Failed to read SKILL.md at %s: %s", path, exc)
        return None
    # BUGFIX: split only on delimiter *lines*. The previous
    # text.split("---", 2) matched any "---" substring — including runs
    # of dashes inside frontmatter values or the markdown body — which
    # truncated the frontmatter and broke YAML parsing.
    parts = re.split(r"^---\s*$", text, maxsplit=2, flags=re.MULTILINE)
    if len(parts) < 3:
        logger.warning("SKILL.md missing frontmatter delimiters: %s", path)
        return None
    try:
        header = yaml.safe_load(parts[1])
    except Exception as exc:
        logger.warning("Invalid YAML frontmatter in %s: %s", path, exc)
        return None
    if not isinstance(header, dict):
        logger.warning("Frontmatter is not a dict in %s", path)
        return None
    name = str(header.get("name", ""))
    if not name:
        logger.warning("SKILL.md missing 'name' field: %s", path)
        return None
    description = str(header.get("description", ""))
    body = parts[2].strip()
    # Build metadata — flatten nested 'metadata' dict from frontmatter
    metadata: dict[str, str] = {}
    raw_meta = header.get("metadata")
    if isinstance(raw_meta, dict):
        for k, v in raw_meta.items():
            metadata[str(k)] = str(v)
    # Also mirror selected top-level keys into metadata; explicit
    # 'metadata' entries take precedence.
    for key in ("category", "license", "compatibility", "version", "author"):
        if key in header and key not in metadata:
            metadata[key] = str(header[key])
    skill_license = str(header.get("license", ""))
    compatibility = str(header.get("compatibility", ""))
    return Skill(
        name=name,
        description=description,
        body=body,
        license=skill_license,
        compatibility=compatibility,
        metadata=metadata,
        source_dir=path.parent,
        source_format="skillmd",
    )
def load_skillmd_from_directory(directory: Path) -> list[Skill]:
    """Recursively scan *directory* for ``SKILL.md`` files.

    Every ``SKILL.md`` found at any depth under *directory* is loaded
    as one skill (the scan uses ``rglob``, not just immediate
    sub-directories — the previous docstring was inaccurate). Files
    that fail to parse are skipped with a warning logged by
    :func:`load_skill_from_skillmd`.

    Args:
        directory: Root directory to scan.

    Returns:
        Parsed skills in sorted path order; empty list when the
        directory does not exist.
    """
    if not directory.exists():
        return []
    loaded: list[Skill] = []
    for skill_md in sorted(directory.rglob("SKILL.md")):
        parsed = load_skill_from_skillmd(skill_md)
        if parsed:
            loaded.append(parsed)
    return loaded
# ── Legacy YAML / JSON loader ────────────────────────────────────────
def load_skill_file(path: Path) -> Skill | None:
    """Load a single skill from a YAML or JSON file.

    Args:
        path: Path to the skill file.

    Returns:
        The parsed :class:`Skill`, or ``None`` when the file cannot be
        read, has an unsupported extension, is not a mapping, or lacks
        a name/id.
    """
    try:
        raw = path.read_text(encoding="utf-8")
        suffix = path.suffix
        if suffix in (".yaml", ".yml"):
            payload = yaml.safe_load(raw)
        elif suffix == ".json":
            payload = json.loads(raw)
        else:
            logger.warning("Unsupported skill file format: %s", path)
            return None
        if not isinstance(payload, dict):
            logger.warning("Skill file is not a dict: %s", path)
            return None
        parsed = Skill.from_dict(payload)
        if not parsed.name:
            logger.warning("Skill missing name/id: %s", path)
            return None
        return parsed
    except Exception as exc:
        logger.warning("Failed to load skill from %s: %s", path, exc)
        return None
def load_skills_from_directory(directory: Path) -> list[Skill]:
    """Recursively load all skills from a directory.

    Supports both ``SKILL.md`` (agentskills.io) and legacy YAML/JSON.
    When both formats exist for the same skill name, SKILL.md wins.

    Args:
        directory: Root directory to scan.

    Returns:
        List of successfully loaded Skill objects; empty when the
        directory does not exist.
    """
    if not directory.exists():
        return []
    skills_by_name: dict[str, Skill] = {}
    # 1. Load SKILL.md files first (higher priority)
    for skill in load_skillmd_from_directory(directory):
        skills_by_name[skill.name] = skill
    # 2. Load legacy YAML/JSON (only if no SKILL.md with same name).
    # BUGFIX: removed the dead `path.name == "__init__.py"` guard —
    # "__init__.py" can never match *.yaml / *.yml / *.json patterns.
    for pattern in ("*.yaml", "*.yml", "*.json"):
        for path in sorted(directory.rglob(pattern)):
            skill = load_skill_file(path)
            if skill and skill.name not in skills_by_name:
                skills_by_name[skill.name] = skill
    skills = list(skills_by_name.values())
    logger.info("Loaded %d skills from %s", len(skills), directory)
    return skills
================================================
FILE: researchclaw/skills/matcher.py
================================================
"""Skill-to-stage matching engine."""
from __future__ import annotations
import logging
import re
from researchclaw.skills.schema import STAGE_NAME_TO_NUMBER, Skill
logger = logging.getLogger(__name__)
def _tokenize(text: str) -> set[str]:
"""Extract lowercase tokens from text."""
return set(re.findall(r"[a-z0-9_]+", text.lower()))
def _resolve_stage(stage: int | str) -> int:
"""Convert a stage name to its number, or pass through an int."""
if isinstance(stage, int):
return stage
return STAGE_NAME_TO_NUMBER.get(stage, -1)
def match_skills(
    skills: list[Skill],
    context: str,
    stage: int | str,
    top_k: int = 3,
    *,
    fallback_matching: bool = True,
) -> list[Skill]:
    """Match skills to the current context and stage.

    Scoring:
      - Stage applicability (must match; an empty list means all stages)
      - Keyword overlap with context, normalized by keyword count
      - Description-based fallback at a 0.5x discount for skills that
        declare no trigger_keywords
      - Priority boost (lower priority value = higher boost)

    Args:
        skills: Available skills to match against.
        context: Current task context text.
        stage: Current pipeline stage number or name.
        top_k: Maximum number of skills to return.
        fallback_matching: Enable description-based matching for skills
            without trigger_keywords.

    Returns:
        List of matched skills sorted by relevance.
    """
    stage_num = _resolve_stage(stage)
    ctx_tokens = _tokenize(context)
    ranked: list[tuple[float, Skill]] = []
    for candidate in skills:
        # Guard: skip skills not applicable to this stage.
        stages = candidate.applicable_stages
        if stages and stage_num not in stages:
            continue
        keywords = candidate.trigger_keywords
        # One point per trigger keyword that shares a token with the context.
        hits = sum(1.0 for kw in keywords if _tokenize(kw) & ctx_tokens)
        if hits > 0.0:
            normalized = hits / max(len(keywords), 1)
        elif not keywords and fallback_matching:
            # Description-based fallback for skills without keywords.
            desc_tokens = _tokenize(candidate.description)
            overlap = len(desc_tokens & ctx_tokens)
            if overlap == 0:
                continue
            normalized = (overlap * 0.5) / max(len(desc_tokens), 1)
        else:
            continue
        # Priority 1 → boost 0.45, priority 10 → boost 0.0.
        boost = (10 - candidate.priority) / 20.0
        ranked.append((normalized + boost, candidate))
    # Highest score first; ties broken by lower priority value.
    ranked.sort(key=lambda item: (-item[0], item[1].priority))
    return [s for _, s in ranked[:top_k]]
def format_skills_for_prompt(skills: list[Skill], max_chars: int = 4000) -> str:
    """Format matched skills as prompt injection text.

    Uses ``skill.body`` as primary content. Truncates long bodies
    (common with external skills) to roughly ``max_chars / len(skills)``
    per skill, and stops appending sections once the overall budget
    would be exceeded.

    Args:
        skills: List of matched skills.
        max_chars: Maximum character limit for the combined output.

    Returns:
        Formatted string for LLM prompt injection ("" when no skills).
    """
    if not skills:
        return ""
    per_skill_budget = max_chars // max(len(skills), 1)
    # BUGFIX: with many skills or a tiny max_chars, the previous
    # `content[:per_skill_budget - 20]` used a *negative* slice index,
    # which kept almost the whole body instead of truncating it.
    keep = max(per_skill_budget - 20, 0)
    parts: list[str] = []
    total_len = 0
    for skill in skills:
        content = skill.body or skill.prompt_template
        # Truncate long bodies to the per-skill budget.
        if len(content) > per_skill_budget:
            content = content[:keep] + "\n\n[... truncated]"
        section = f"### {skill.name} ({skill.category})\n{content}"
        if skill.code_template:
            section += f"\n**Code Template:**\n```python\n{skill.code_template}\n```"
        if skill.references:
            section += "\n**References:** " + "; ".join(skill.references)
        # Hard stop: never exceed the overall character budget.
        if total_len + len(section) > max_chars:
            break
        parts.append(section)
        total_len += len(section)
    return "\n\n".join(parts)
================================================
FILE: researchclaw/skills/registry.py
================================================
"""Skill registry — central hub for loading and querying skills."""
from __future__ import annotations
import logging
from pathlib import Path
from researchclaw.skills.loader import load_skills_from_directory
from researchclaw.skills.matcher import format_skills_for_prompt, match_skills
from researchclaw.skills.schema import Skill
logger = logging.getLogger(__name__)
# Default builtin directory relative to this file
_BUILTIN_DIR = Path(__file__).parent / "builtin"
class SkillRegistry:
    """Central registry for managing and querying skills.

    Loads builtin skills on init, then optionally loads custom and
    external skills from user-specified directories. Directories are
    loaded in order (builtin → custom → external); a later skill with
    the same name overwrites an earlier one via :meth:`register`.
    """

    def __init__(
        self,
        builtin_dir: str | Path = "",
        custom_dirs: tuple[str, ...] | list[str] = (),
        external_dirs: tuple[str, ...] | list[str] = (),
        auto_match: bool = True,
        max_skills_per_stage: int = 3,
        fallback_matching: bool = True,
    ) -> None:
        """Initialize the registry and eagerly load all skill directories.

        Args:
            builtin_dir: Override for the builtin skills directory;
                empty string selects the packaged ``builtin/`` folder.
            custom_dirs: Additional user skill directories.
            external_dirs: Third-party skill directories (loaded with
                the same mechanism as custom ones).
            auto_match: Stored flag; not consulted by this class itself.
            max_skills_per_stage: Default ``top_k`` used by :meth:`match`.
            fallback_matching: Passed through to the matcher to enable
                description-based matching for keyword-less skills.
        """
        self._skills: dict[str, Skill] = {}
        self._auto_match = auto_match
        self._max_skills = max_skills_per_stage
        self._fallback_matching = fallback_matching
        # Builtin first so custom/external skills can shadow them.
        builtin = Path(builtin_dir) if builtin_dir else _BUILTIN_DIR
        self._load_from_dir(builtin)
        for d in custom_dirs:
            self._load_from_dir(Path(d))
        # External skills reuse the same loading mechanism.
        for d in external_dirs:
            self._load_from_dir(Path(d))

    def _load_from_dir(self, directory: Path) -> None:
        """Load skills from *directory* and register each one."""
        for skill in load_skills_from_directory(directory):
            self.register(skill)

    def register(self, skill: Skill) -> None:
        """Register a skill. Overwrites existing skill with same name.

        Args:
            skill: The skill to register.
        """
        self._skills[skill.name] = skill
        logger.debug("Registered skill: %s", skill.name)

    def unregister(self, skill_id: str) -> bool:
        """Remove a skill from the registry.

        Args:
            skill_id: The name/ID of the skill to remove.

        Returns:
            True if the skill was found and removed, False otherwise.
        """
        if skill_id in self._skills:
            del self._skills[skill_id]
            return True
        return False

    def get(self, skill_id: str) -> Skill | None:
        """Return the skill registered under *skill_id*, or None."""
        return self._skills.get(skill_id)

    def list_all(self) -> list[Skill]:
        """Return all registered skills (insertion order)."""
        return list(self._skills.values())

    def list_by_category(self, category: str) -> list[Skill]:
        """Return skills whose ``category`` equals *category*."""
        return [s for s in self._skills.values() if s.category == category]

    def list_by_stage(self, stage: int) -> list[Skill]:
        """Return skills applicable to *stage* (empty stages = all)."""
        return [
            s for s in self._skills.values()
            if not s.applicable_stages or stage in s.applicable_stages
        ]

    def match(
        self,
        context: str,
        stage: int | str,
        top_k: int | None = None,
    ) -> list[Skill]:
        """Match skills to current context and stage.

        Args:
            context: Task context text.
            stage: Current pipeline stage number or name.
            top_k: Max results. ``None`` (default) uses
                ``max_skills_per_stage``; an explicit ``0`` returns no
                skills.

        Returns:
            List of matched skills sorted by relevance.
        """
        # BUGFIX: was `top_k or self._max_skills`, which silently turned
        # an explicit top_k=0 into the default limit.
        k = self._max_skills if top_k is None else top_k
        return match_skills(
            list(self._skills.values()),
            context,
            stage,
            top_k=k,
            fallback_matching=self._fallback_matching,
        )

    def export_for_prompt(
        self,
        skills: list[Skill],
        max_chars: int = 4000,
    ) -> str:
        """Format matched skills as prompt injection text.

        Args:
            skills: List of matched skills.
            max_chars: Character limit.

        Returns:
            Formatted prompt text.
        """
        return format_skills_for_prompt(skills, max_chars=max_chars)

    def count(self) -> int:
        """Return total number of registered skills."""
        return len(self._skills)
================================================
FILE: researchclaw/skills/schema.py
================================================
"""Skill data model definition (agentskills.io compatible)."""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
# Maps pipeline stage names to stage numbers.
# Used by the matcher to resolve string stage identifiers (e.g.
# "code_generation") to the numeric stages stored in skill metadata;
# numbers 1-23 encode the pipeline's fixed stage order.
STAGE_NAME_TO_NUMBER: dict[str, int] = {
    "topic_init": 1,
    "problem_decompose": 2,
    "search_strategy": 3,
    "literature_collect": 4,
    "literature_screen": 5,
    "knowledge_extract": 6,
    "synthesis": 7,
    "hypothesis_gen": 8,
    "experiment_design": 9,
    "code_generation": 10,
    "resource_planning": 11,
    "experiment_run": 12,
    "iterative_refine": 13,
    "result_analysis": 14,
    "research_decision": 15,
    "paper_outline": 16,
    "paper_draft": 17,
    "peer_review": 18,
    "paper_revision": 19,
    "quality_gate": 20,
    "knowledge_archive": 21,
    "export_publish": 22,
    "citation_verify": 23,
}
# Valid categories in the new taxonomy.
# NOTE(review): not enforced anywhere visible in this module —
# presumably validated by callers; confirm before relying on it.
VALID_CATEGORIES = ("writing", "domain", "experiment", "tooling")
@dataclass
class Skill:
    """A single skill definition (agentskills.io compatible).

    Standard fields follow the agentskills.io specification.
    Legacy YAML fields are accessible via backward-compat properties
    that read from ``metadata``.
    """

    # agentskills.io standard fields
    name: str
    description: str
    body: str = ""
    license: str = ""
    compatibility: str = ""
    metadata: dict[str, str] = field(default_factory=dict)
    # filesystem context
    source_dir: Path | None = None  # directory the skill was loaded from
    source_format: str = "skillmd"  # "skillmd" | "yaml"

    # ── backward-compat property accessors ───────────────────────
    @property
    def id(self) -> str:  # noqa: A003
        """Alias for ``name`` (legacy)."""
        return self.name

    @property
    def category(self) -> str:
        """Category from metadata; defaults to ``"domain"``."""
        return self.metadata.get("category", "domain")

    @property
    def trigger_keywords(self) -> list[str]:
        """Keywords parsed from comma-separated ``trigger-keywords``."""
        raw = self.metadata.get("trigger-keywords", "")
        return [k.strip() for k in raw.split(",") if k.strip()] if raw else []

    @property
    def applicable_stages(self) -> list[int]:
        """Stage numbers parsed from comma-separated ``applicable-stages``.

        Numeric tokens ("10") are taken as-is; stage *names* such as
        "code_generation" are resolved through STAGE_NAME_TO_NUMBER
        (generalization — previously non-digit tokens were dropped).
        Unknown tokens are silently ignored.
        """
        raw = self.metadata.get("applicable-stages", "")
        if not raw:
            return []
        parts: list[int] = []
        for tok in raw.split(","):
            tok = tok.strip()
            if tok.isdigit():
                parts.append(int(tok))
            elif tok in STAGE_NAME_TO_NUMBER:
                parts.append(STAGE_NAME_TO_NUMBER[tok])
        return parts

    @property
    def priority(self) -> int:
        """Priority from metadata (lower = higher priority).

        Falls back to 5 when the value is missing or not a valid
        integer — BUGFIX: previously raised ValueError on malformed
        metadata such as ``priority: high``.
        """
        try:
            return int(self.metadata.get("priority", "5"))
        except (TypeError, ValueError):
            return 5

    @property
    def prompt_template(self) -> str:
        """Alias for ``body`` (legacy)."""
        return self.body

    @property
    def code_template(self) -> str | None:
        """Optional ``code-template`` metadata (empty string → None)."""
        return self.metadata.get("code-template") or None

    @property
    def references(self) -> list[str]:
        """References parsed from semicolon-separated ``references``."""
        raw = self.metadata.get("references", "")
        return [r.strip() for r in raw.split(";") if r.strip()] if raw else []

    @property
    def version(self) -> str:
        """Version string from metadata; defaults to "1.0"."""
        return self.metadata.get("version", "1.0")

    # ── serialization ────────────────────────────────────────────
    def to_dict(self) -> dict[str, Any]:
        """Serialize to dictionary (legacy-compatible output)."""
        return {
            "id": self.name,
            "name": self.name,
            "category": self.category,
            "description": self.description,
            "trigger_keywords": self.trigger_keywords,
            "applicable_stages": self.applicable_stages,
            "prompt_template": self.body,
            "code_template": self.code_template,
            "references": self.references,
            "version": self.version,
            "priority": self.priority,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Skill:
        """Deserialize from a legacy YAML/JSON dictionary.

        Legacy top-level fields are packed into ``metadata`` so the
        property accessors above keep working. Explicit ``metadata``
        entries are merged with ``setdefault`` — the packed legacy
        fields win on conflict.
        """
        meta: dict[str, str] = {}
        if data.get("category"):
            meta["category"] = str(data["category"])
        kw = data.get("trigger_keywords") or []
        if kw:
            meta["trigger-keywords"] = ",".join(str(k) for k in kw)
        stages = data.get("applicable_stages") or []
        if stages:
            meta["applicable-stages"] = ",".join(str(s) for s in stages)
        if data.get("priority") is not None:
            meta["priority"] = str(data["priority"])
        if data.get("version"):
            meta["version"] = str(data["version"])
        if data.get("code_template"):
            meta["code-template"] = str(data["code_template"])
        refs = data.get("references") or []
        if refs:
            meta["references"] = "; ".join(str(r) for r in refs)
        # Merge any explicit metadata from the dict.
        if isinstance(data.get("metadata"), dict):
            for k, v in data["metadata"].items():
                meta.setdefault(str(k), str(v))
        name = str(data.get("name") or data.get("id") or "")
        # For legacy YAML, prefer a slug-looking 'id' (contains "-")
        # over a display-style 'name'.
        raw_id = str(data.get("id", ""))
        if raw_id and "-" in raw_id:
            name = raw_id
        return cls(
            name=name,
            description=str(data.get("description", "")),
            body=str(data.get("prompt_template", "")),
            metadata=meta,
            source_format="yaml",
        )
================================================
FILE: researchclaw/templates/__init__.py
================================================
"""Conference-aware LaTeX template system.
Supports automatic template switching for NeurIPS, ICLR, and ICML.
Given a target conference name, generates a complete ``.tex`` file from
Markdown paper content + BibTeX references.
Usage::
from researchclaw.templates import get_template, markdown_to_latex
tpl = get_template("neurips_2025")
tex = markdown_to_latex(paper_md, tpl, title=..., authors=..., bib_file="references.bib")
"""
from researchclaw.templates.conference import (
CONFERENCE_REGISTRY,
ConferenceTemplate,
get_template,
list_conferences,
)
from researchclaw.templates.converter import markdown_to_latex
# Public names re-exported by this package (imported above from
# researchclaw.templates.conference and .converter).
__all__ = [
    "CONFERENCE_REGISTRY",
    "ConferenceTemplate",
    "get_template",
    "list_conferences",
    "markdown_to_latex",
]
================================================
FILE: researchclaw/templates/compiler.py
================================================
"""LaTeX compilation and error repair utilities (IMP-18).
Provides ``compile_latex()`` which attempts ``pdflatex`` compilation,
parses the log for common errors, applies automated fixes, and retries
up to 3 times. Designed to run inside ``_package_deliverables()`` so
that the final paper.tex in ``deliverables/`` is compile-tested.
If pdflatex is not installed the module gracefully returns a failure
report without raising.
"""
from __future__ import annotations
import logging
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
logger = logging.getLogger(__name__)
# BUG-201: Cyrillic → Latin transliteration for author names from Semantic Scholar.
# pdflatex without T2A font encoding chokes on Cyrillic (e.g. "А. И. Колесников").
# NOTE: the transliteration is lossy and not reversible — hard/soft
# signs (Ъ/Ь/ъ/ь) map to "" and several letters map to multi-character
# Latin digraphs (e.g. Ж → "Zh", Щ → "Shch").
_CYRILLIC_TO_LATIN_MAP: dict[str, str] = {
    "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D", "Е": "E",
    "Ё": "E", "Ж": "Zh", "З": "Z", "И": "I", "Й": "Y", "К": "K",
    "Л": "L", "М": "M", "Н": "N", "О": "O", "П": "P", "Р": "R",
    "С": "S", "Т": "T", "У": "U", "Ф": "F", "Х": "Kh", "Ц": "Ts",
    "Ч": "Ch", "Ш": "Sh", "Щ": "Shch", "Ъ": "", "Ы": "Y", "Ь": "",
    "Э": "E", "Ю": "Yu", "Я": "Ya",
    "а": "a", "б": "b", "в": "v", "г": "g", "д": "d", "е": "e",
    "ё": "e", "ж": "zh", "з": "z", "и": "i", "й": "y", "к": "k",
    "л": "l", "м": "m", "н": "n", "о": "o", "п": "p", "р": "r",
    "с": "s", "т": "t", "у": "u", "ф": "f", "х": "kh", "ц": "ts",
    "ч": "ch", "ш": "sh", "щ": "shch", "ъ": "", "ы": "y", "ь": "",
    "э": "e", "ю": "yu", "я": "ya",
}
@dataclass
class CompileResult:
    """Outcome of a LaTeX compilation attempt (see ``compile_latex``)."""

    success: bool  # True when the final pass had no fatal errors
    log_excerpt: str = ""  # tail (≤ 2000 chars) of the last pdflatex log, or a short status message
    errors: list[str] = field(default_factory=list)  # all error lines parsed from the log
    warnings: list[str] = field(default_factory=list)  # "LaTeX Warning:" lines from the log
    fixes_applied: list[str] = field(default_factory=list)  # human-readable descriptions of auto-fixes
    attempts: int = 0  # number of compile→fix cycles actually performed
def compile_latex(
    tex_path: Path,
    *,
    max_attempts: int = 3,
    timeout: int = 120,
) -> CompileResult:
    """Compile *tex_path* with pdflatex, auto-fixing common errors.

    Each attempt runs a full 3-pass pipeline (pdflatex → bibtex →
    pdflatex × 2), parses the final log, and — when fatal errors remain
    and fixes are available — rewrites the .tex in place via
    ``fix_common_latex_errors`` before retrying.

    Parameters
    ----------
    tex_path:
        Path to the ``.tex`` file. Must be inside a directory that also
        contains ``references.bib`` and any required ``.sty`` files.
    max_attempts:
        Maximum compile→fix cycles.
    timeout:
        Seconds before killing a stuck pdflatex process.

    Returns
    -------
    CompileResult
        Contains success flag, log excerpt, errors found, and fixes applied.
    """
    # Graceful degradation when TeX is simply not installed: report
    # failure instead of raising (per module docstring).
    if not shutil.which("pdflatex"):
        return CompileResult(
            success=False,
            log_excerpt="pdflatex not found on PATH",
            errors=["pdflatex not installed"],
        )
    result = CompileResult(success=False)
    work_dir = tex_path.parent
    tex_name = tex_path.name
    # "paper.tex" → "paper"; bibtex is invoked with the bare stem.
    bib_stem = tex_name.rsplit(".", 1)[0]
    # Pre-flight: sanitize .bib file (escape bare & in field values)
    # Find bib filename from \bibliography{...} in the tex source
    _tex_src = tex_path.read_text(encoding="utf-8", errors="replace")
    _bib_match = re.search(r"\\bibliography\{([^}]+)\}", _tex_src)
    _bib_name = _bib_match.group(1) if _bib_match else bib_stem
    _sanitize_bib_file(work_dir / f"{_bib_name}.bib")
    # BUG-197: Pre-flight — strip invisible/problematic Unicode from .tex.
    # Characters like U+202F (NARROW NO-BREAK SPACE) cause pdflatex to emit
    # broken UTF-8 in error messages, which crashes subprocess text decoding
    # and prevents the bibtex + multi-pass pipeline from completing.
    _sanitize_tex_unicode(tex_path)
    for attempt in range(1, max_attempts + 1):
        result.attempts = attempt
        # --- Full 3-pass compilation: pdflatex → bibtex → pdflatex × 2 ---
        # Pass 1: generate .aux (needed by bibtex). Use nonstopmode (NOT
        # halt-on-error) so .aux is written even when there are non-fatal
        # errors like missing figures or overfull hboxes.
        # (pass1_ok is intentionally unused — see the bibtex note below.)
        log_text, pass1_ok = _run_pdflatex(work_dir, tex_name, timeout)
        if log_text is None:
            # No log at all — presumably the pdflatex invocation itself
            # failed (helper defined elsewhere in this module); abort.
            result.errors.append(f"pdflatex failed on pass 1 (attempt {attempt})")
            break
        # BibTeX: always run after pass 1 — it only needs .aux + .bib.
        # Previously gated behind pass1 success, which meant citations were
        # always [?] when the first pass had non-fatal errors.
        _run_bibtex(work_dir, bib_stem, timeout=60)
        # Passes 2-3: resolve cross-references and bibliography
        for _pass in (2, 3):
            pass_log, _ = _run_pdflatex(work_dir, tex_name, timeout)
            if pass_log is not None:
                log_text = pass_log  # keep final pass log for error analysis
        # Parse the final log for errors/warnings
        errors, warnings = _parse_log(log_text)
        result.warnings = warnings
        result.log_excerpt = log_text[-2000:] if len(log_text) > 2000 else log_text
        # Check for fatal errors only — non-fatal ones (overfull hbox,
        # missing figure in draft) don't prevent a valid PDF.
        fatal = [e for e in errors if _is_fatal_error(e)]
        result.errors = errors
        if not fatal:
            result.success = True
            logger.info("IMP-18: LaTeX compiled successfully on attempt %d", attempt)
            break
        # Try to auto-fix fatal errors by rewriting the .tex in place.
        tex_text = tex_path.read_text(encoding="utf-8")
        fixed_text, fixes = fix_common_latex_errors(tex_text, errors)
        if fixes:
            result.fixes_applied.extend(fixes)
            tex_path.write_text(fixed_text, encoding="utf-8")
            logger.info(
                "IMP-18: Applied %d fixes on attempt %d: %s",
                len(fixes),
                attempt,
                fixes,
            )
        else:
            # No fixes available — stop retrying
            logger.warning(
                "IMP-18: Compilation failed on attempt %d with %d unfixable errors",
                attempt,
                len(fatal),
            )
            break
    return result
def fix_common_latex_errors(
tex_text: str, errors: list[str]
) -> tuple[str, list[str]]:
"""Apply automated fixes for common LaTeX errors.
Returns ``(fixed_text, list_of_fix_descriptions)``.
"""
fixes: list[str] = []
fixed = tex_text
# --- Pre-error-loop fixes: structural repairs that prevent compilation ---
# Fix escaped braces in tabular column specs: \{lcccc\} → {lcccc}
if re.search(r"\\begin\{tabular\}\\\{", fixed):
fixed = re.sub(
r"\\begin\{tabular\}\\\{([^}]*?)\\\}",
r"\\begin{tabular}{\1}",
fixed,
)
fixes.append("Fixed escaped braces in tabular column specs")
# Fix escaped & inside tabular data rows: \& → & (column separator).
# The converter's _escape_latex escapes & globally; inside tabular
# environments the & must remain unescaped as the column separator.
if "\\begin{tabular}" in fixed and "\\&" in fixed:
fixed, n_tab_amp = _fix_escaped_ampersand_in_tabular(fixed)
if n_tab_amp:
fixes.append(f"Un-escaped \\& in {n_tab_amp} tabular data row(s)")
# Fix escaped \} at end of \caption{...}: \caption{text.\}} → \caption{text.}
if re.search(r"\\caption\{.*?\\\}", fixed):
fixed = re.sub(
r"(\\caption\{[^}]*?)\\\}",
r"\1}",
fixed,
)
fixes.append("Fixed escaped \\} in \\caption arguments")
# Collapse multiple consecutive \clearpage into one
if re.search(r"(\\clearpage\s*){2,}", fixed):
fixed = re.sub(r"(\\clearpage\s*){2,}", "\\\\clearpage\n", fixed)
fixes.append("Collapsed multiple \\clearpage commands")
# Remove \textbf{Figure N.} paragraphs that follow \end{figure}
dup_cap = re.search(
r"(\\end\{figure\})\s*\n\s*\\textbf\{Figure\s+\d+",
fixed,
)
if dup_cap:
fixed = re.sub(
r"(\\end\{figure\})\s*\n\s*\\textbf\{Figure\s+\d+[.:].*?\}\s*\n",
r"\1\n",
fixed,
)
fixes.append("Removed duplicate bold Figure captions after \\end{figure}")
# BUG-189: Fix Python-style pseudocode inside algorithmic environments.
# LLM generates `# comment` (LaTeX macro param char) and `var_name`
# (unescaped underscore) inside \STATE commands — causes cascading errors.
_algo_pat = re.compile(
r"(\\begin\{algorithmic\}.*?\\end\{algorithmic\})", re.DOTALL
)
def _fix_algo_block(m: re.Match) -> str:
block = m.group(0)
lines = block.split("\n")
out: list[str] = []
for line in lines:
if line.strip().startswith(("\\begin{", "\\end{")):
out.append(line)
continue
# Replace # (Python comment) with \COMMENT{...}
if "#" in line and "\\#" not in line:
line = re.sub(r"#\s*(.*)$", r"\\COMMENT{\1}", line)
# Escape bare underscores not already in math mode
# Don't touch \STATE, \IF, \FOR, etc. commands
parts = re.split(r"(\\\w+\{[^}]*\}|\$[^$]+\$)", line)
fixed_parts = []
for part in parts:
if part.startswith("\\") or part.startswith("$"):
fixed_parts.append(part)
else:
fixed_parts.append(re.sub(r'(? tuple[list[str], list[str]]:
"""Parse pdflatex log output for errors and warnings."""
errors: list[str] = []
warnings: list[str] = []
for line in log_text.split("\n"):
line_stripped = line.strip()
line_lower = line_stripped.lower()
if line_stripped.startswith("!"):
errors.append(line_stripped)
elif "LaTeX Warning:" in line_stripped:
warnings.append(line_stripped)
# BUG-R6-26: Use elif to avoid duplicating "!" lines
elif "Undefined control sequence" in line_stripped:
errors.append(line_stripped)
elif "Missing" in line_stripped and "inserted" in line_stripped:
errors.append(line_stripped)
elif "File" in line_stripped and "not found" in line_stripped:
errors.append(line_stripped)
# BUG-R6-21: Detect "Float(s) lost" and "Too many unprocessed floats"
# even when they don't start with "!"
elif "float(s) lost" in line_lower:
errors.append(line_stripped)
elif "too many unprocessed floats" in line_lower:
errors.append(line_stripped)
return errors, warnings
@dataclass
class QualityCheckResult:
    """Results of post-compilation quality checks."""

    unresolved_refs: list[str] = field(default_factory=list)
    unresolved_cites: list[str] = field(default_factory=list)
    overfull_hboxes: list[str] = field(default_factory=list)
    underfull_hboxes: list[str] = field(default_factory=list)
    page_count: int = 0
    orphan_figures: list[str] = field(default_factory=list)
    orphan_labels: list[str] = field(default_factory=list)
    warnings_summary: list[str] = field(default_factory=list)

    @property
    def has_critical_issues(self) -> bool:
        """True when any reference or citation failed to resolve."""
        return bool(self.unresolved_refs or self.unresolved_cites)


def check_compiled_quality(
    tex_path: Path,
    *,
    page_limit: int = 10,
) -> QualityCheckResult:
    """Run post-compilation quality checks on a LaTeX document.

    Parses the .log file and .tex source for:
    - Unresolved references (??)
    - Unresolved citations
    - Overfull/underfull hboxes
    - Page count vs limit
    - Orphan figures (defined but never referenced, or vice versa)
    """
    report = QualityCheckResult()
    build_dir = tex_path.parent
    stem = tex_path.stem

    # --- Scan the pdflatex log, if one exists ---
    log_path = build_dir / f"{stem}.log"
    log_text = ""
    if log_path.exists():
        log_text = log_path.read_text(encoding="utf-8", errors="replace")
        for raw_line in log_text.split("\n"):
            line_s = raw_line.strip()
            if "LaTeX Warning: Reference" in line_s and "undefined" in line_s:
                report.unresolved_refs.append(line_s)
            if "LaTeX Warning: Citation" in line_s and "undefined" in line_s:
                report.unresolved_cites.append(line_s)
            if "Overfull \\hbox" in line_s:
                # Only flag significant overflows (> 1pt).
                size = re.search(r"(\d+\.?\d*)pt", line_s)
                if size and float(size.group(1)) > 1.0:
                    report.overfull_hboxes.append(line_s)
            if "Underfull \\hbox" in line_s and "badness" in line_s:
                # Only flag badness >= 5000.
                bad = re.search(r"badness (\d+)", line_s)
                if bad and int(bad.group(1)) >= 5000:
                    report.underfull_hboxes.append(line_s)

    # --- Page count: prefer the .aux LastPage label, fall back to the log ---
    aux_path = build_dir / f"{stem}.aux"
    if aux_path.exists():
        aux_text = aux_path.read_text(encoding="utf-8", errors="replace")
        lastpage = re.search(r"\\newlabel\{LastPage\}\{\{(\d+)\}", aux_text)
        if lastpage:
            report.page_count = int(lastpage.group(1))
    if report.page_count == 0:
        written = re.search(r"Output written on .* \((\d+) page", log_text)
        if written:
            report.page_count = int(written.group(1))

    # --- Cross-reference validation against the .tex source ---
    source = tex_path.read_text(encoding="utf-8", errors="replace")
    labels = set(re.findall(r"\\label\{(fig:[^}]+)\}", source))
    refs = set(re.findall(r"\\ref\{(fig:[^}]+)\}", source))
    report.orphan_labels = sorted(labels - refs)   # defined, never referenced
    report.orphan_figures = sorted(refs - labels)  # referenced, never defined

    # --- Human-readable summary ---
    summary = report.warnings_summary
    if report.unresolved_refs:
        summary.append(f"{len(report.unresolved_refs)} unresolved reference(s)")
    if report.unresolved_cites:
        summary.append(f"{len(report.unresolved_cites)} unresolved citation(s)")
    if report.overfull_hboxes:
        summary.append(f"{len(report.overfull_hboxes)} overfull hbox(es) > 1pt")
    if report.page_count > page_limit:
        summary.append(f"Page count {report.page_count} exceeds limit {page_limit}")
    if report.orphan_figures:
        summary.append(
            f"{len(report.orphan_figures)} referenced but undefined figure(s): "
            + ", ".join(report.orphan_figures[:3])
        )
    if report.orphan_labels:
        summary.append(
            f"{len(report.orphan_labels)} defined but unreferenced figure(s): "
            + ", ".join(report.orphan_labels[:3])
        )
    return report
def remove_missing_figures(tex_text: str, stage_dir: Path) -> tuple[str, list[str]]:
    """Remove \\begin{figure}...\\end{figure} blocks that reference missing images.

    Before dropping a block, attempts a prefix-match rename against the
    image's directory (unique ``stem*.png`` candidate). After removals,
    dangling ``\\ref{fig:...}`` occurrences are rewritten so they do not
    render as "??" in the PDF.

    Returns ``(fixed_text, list_of_removed_paths)``.
    """
    removed: list[str] = []

    def _rewrite_block(match: re.Match) -> str:
        figure = match.group(0)
        include = re.search(r"\\includegraphics.*?\{([^}]+)\}", figure)
        if include is None:
            return figure
        rel = include.group(1)
        full = stage_dir / rel
        if full.exists():
            return figure
        # Try prefix-matching: fig_main_results.png → fig_main_results_comparison.png
        folder = full.parent
        base = full.stem
        if folder.exists():
            candidates = sorted(folder.glob(f"{base}*.png"))
            if len(candidates) == 1:
                replacement = str(candidates[0].relative_to(stage_dir))
                logger.info(
                    "Auto-mapped missing figure: %s → %s",
                    rel, replacement,
                )
                return figure.replace(rel, replacement)
        logger.warning(
            "Removing figure block with missing image: %s",
            rel,
        )
        removed.append(rel)
        return ""  # drop the entire figure block

    fixed = re.sub(
        r"\\begin\{figure\}.*?\\end\{figure\}",
        _rewrite_block,
        tex_text,
        flags=re.DOTALL,
    )

    # Clean up orphan \ref{fig:X} pointing at removed/nonexistent figures.
    if removed:
        kept_labels = set(re.findall(r"\\label\{(fig:[^}]+)\}", fixed))
        used_refs = set(re.findall(r"\\ref\{(fig:[^}]+)\}", fixed))
        for dangling in used_refs - kept_labels:
            # "Figure \ref{fig:X}" / "Fig. \ref{fig:X}" → placeholder text
            fixed = re.sub(
                rf"(?:Figure|Fig\.?)\s*~?\\ref\{{{re.escape(dangling)}\}}",
                "(figure omitted)",
                fixed,
            )
            # Standalone \ref{fig:X}
            fixed = fixed.replace(f"\\ref{{{dangling}}}", "(ref omitted)")
    return fixed, removed
def _sanitize_tex_unicode(tex_path: Path) -> None:
    """Strip problematic Unicode characters from .tex source.

    BUG-197: non-ASCII whitespace such as U+202F (NARROW NO-BREAK SPACE),
    U+2009 (THIN SPACE) and U+00A0 (NO-BREAK SPACE) makes pdflatex emit
    broken UTF-8 in its error output, which crashes
    ``subprocess.run(text=True)`` and aborts the bibtex + multi-pass
    pipeline.  Such characters typically arrive via LLM copy-paste from
    web sources or papers.  Whitespace-like characters are replaced with
    a plain ASCII space; invisible control characters are deleted.

    BUG-201: any Cyrillic that leaked into the ``.tex`` (from bib entries
    inlined by bibtex, or LLM-generated text) is transliterated to Latin
    via ``_CYRILLIC_TO_LATIN_MAP``.

    The file is rewritten in place only when something actually changed.
    """
    if not tex_path.exists():
        return
    try:
        original = tex_path.read_text(encoding="utf-8", errors="replace")
    except Exception:
        return

    # Translation table: whitespace-like code points → ASCII space,
    # invisible control characters → None (deletion).
    space_like = (
        "\u00a0"   # NO-BREAK SPACE
        "\u202f"   # NARROW NO-BREAK SPACE (BUG-197 trigger)
        "\u2009"   # THIN SPACE
        "\u2007"   # FIGURE SPACE
        "\u2008"   # PUNCTUATION SPACE
        "\u200a"   # HAIR SPACE
        "\u205f"   # MEDIUM MATHEMATICAL SPACE
        "\u3000"   # IDEOGRAPHIC SPACE
    )
    invisible = (
        "\u200e"   # LEFT-TO-RIGHT MARK
        "\u200f"   # RIGHT-TO-LEFT MARK
        "\ufeff"   # BOM / ZERO-WIDTH NO-BREAK SPACE
        "\u200b"   # ZERO-WIDTH SPACE
        "\u200c"   # ZERO-WIDTH NON-JOINER
        "\u200d"   # ZERO-WIDTH JOINER
        "\u00ad"   # SOFT HYPHEN
        "\u2060"   # WORD JOINER
        "\u2028"   # LINE SEPARATOR
        "\u2029"   # PARAGRAPH SEPARATOR
    )
    table: dict[int, str | None] = {ord(ch): " " for ch in space_like}
    table.update({ord(ch): None for ch in invisible})
    sanitized = original.translate(table)

    # BUG-201: transliterate Cyrillic so pdflatex does not choke without
    # T2A font encoding.
    if any("\u0400" <= ch <= "\u04ff" for ch in sanitized):
        for cyr, lat in _CYRILLIC_TO_LATIN_MAP.items():
            sanitized = sanitized.replace(cyr, lat)

    if sanitized != original:
        tex_path.write_text(sanitized, encoding="utf-8")
        logger.info("BUG-197: Sanitized problematic Unicode in %s", tex_path.name)
def _sanitize_bib_file(bib_path: Path) -> None:
    """Sanitize .bib files: escape bare ``&`` and strip invisible Unicode.

    BibTeX treats ``&`` as a special character; journal names like
    "Science & Technology" must use ``\\&``.

    BUG-180: Invisible Unicode characters (U+200E LEFT-TO-RIGHT MARK,
    U+200F RIGHT-TO-LEFT MARK, U+FEFF BOM, U+200B ZERO-WIDTH SPACE,
    U+200C/U+200D joiners, U+00AD soft hyphen) can appear in
    copy-pasted author names and break pdflatex.

    BUG-201: Cyrillic author names (e.g. from Semantic Scholar) are
    transliterated to Latin, since pdflatex without T2A font encoding
    raises "! LaTeX Error: Unicode character" on them.

    BUG-217: literal ``\\n``/``\\r``/``\\t`` escape sequences embedded in
    field values by API responses are stripped — a literal ``\\n`` is
    never a valid BibTeX/LaTeX command and causes "Undefined control
    sequence" errors.

    The file is rewritten in place only when its content changed.
    """
    if not bib_path.exists():
        return
    try:
        text = bib_path.read_text(encoding="utf-8")
    except Exception:
        return
    # Snapshot the text BEFORE any mutation.  The final write decision
    # compares against this.  (Previously the snapshot was taken *after*
    # the invisible-char stripping, so files whose only problem was
    # invisible Unicode were silently never written back.)
    original = text
    # BUG-180: Strip invisible Unicode characters
    _INVISIBLE_CHARS = (
        "\u200e",  # LEFT-TO-RIGHT MARK
        "\u200f",  # RIGHT-TO-LEFT MARK
        "\ufeff",  # BOM / ZERO-WIDTH NO-BREAK SPACE
        "\u200b",  # ZERO-WIDTH SPACE
        "\u200c",  # ZERO-WIDTH NON-JOINER
        "\u200d",  # ZERO-WIDTH JOINER
        "\u00ad",  # SOFT HYPHEN
        "\u2060",  # WORD JOINER
        "\u2028",  # LINE SEPARATOR
        "\u2029",  # PARAGRAPH SEPARATOR
    )
    for ch in _INVISIBLE_CHARS:
        if ch in text:
            text = text.replace(ch, "")
    # BUG-201: Transliterate Cyrillic characters to Latin equivalents,
    # preserving name readability.
    for cyr, lat in _CYRILLIC_TO_LATIN_MAP.items():
        if cyr in text:
            text = text.replace(cyr, lat)
    # BUG-217: Strip literal escape sequences (\n, \r, \t) in field values.
    text = re.sub(r"\\n(?=\s)", " ", text)
    text = re.sub(r"\\r(?=\s)", "", text)
    text = re.sub(r"\\t(?=\s)", " ", text)
    lines = text.split("\n")
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Only fix field-value lines (e.g. journal = {Science & Technology},)
        # Skip @type{ lines, key lines, and URL/DOI fields (BUG-DA8-12)
        if "=" in stripped and "{" in stripped and "&" in stripped and "\\&" not in stripped:
            _field_name = stripped.split("=", 1)[0].strip().lower()
            if _field_name in ("url", "doi", "howpublished", "eprint"):
                continue  # Don't escape & in URLs
            lines[i] = line.replace("&", "\\&")
    new_text = "\n".join(lines)
    # Write back iff ANY of the passes above changed the content.
    if new_text != original:
        bib_path.write_text(new_text, encoding="utf-8")
        logger.info("Sanitized bib file %s", bib_path.name)
def _fix_escaped_ampersand_in_tabular(tex: str) -> tuple[str, int]:
"""Replace ``\\&`` with ``&`` inside tabular environments.
Only touches data rows (between \\toprule/\\midrule/\\bottomrule)
to avoid corrupting ``\\&`` in regular text. Returns the fixed text
and the count of rows fixed.
"""
count = 0
def _fix_tabular(m: re.Match[str]) -> str:
nonlocal count
block = m.group(0)
if "\\&" not in block:
return block
# Only un-escape \& on lines that look like data rows (contain \\)
lines = block.split("\n")
for i, line in enumerate(lines):
if "\\&" in line and "\\\\" in line:
lines[i] = line.replace("\\&", "&")
count += 1
return "\n".join(lines)
tex = re.sub(
r"\\begin\{tabular\}.*?\\end\{tabular\}",
_fix_tabular,
tex,
flags=re.DOTALL,
)
return tex, count
def _run_pdflatex(
    work_dir: Path,
    tex_name: str,
    timeout: int = 120,
) -> tuple[str | None, bool]:
    """Run a single pdflatex pass with ``-interaction=nonstopmode``.

    Returns ``(log_text, success)``; *log_text* is ``None`` only on hard
    failures (timeout, binary missing).

    BUG-197: output is captured as bytes and decoded manually with
    ``errors="replace"`` instead of ``text=True``.  pdflatex stdout can
    contain broken UTF-8 sequences (e.g. from U+202F NARROW NO-BREAK
    SPACE error messages) that would raise ``UnicodeDecodeError`` under
    ``text=True`` and kill the whole compilation pipeline — bibtex would
    never run and every citation would render as [?].
    """
    cmd = ["pdflatex", "-interaction=nonstopmode", tex_name]
    try:
        proc = subprocess.run(
            cmd,
            cwd=work_dir,
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.warning("pdflatex timed out after %ds", timeout)
        return None, False
    except FileNotFoundError:
        # pdflatex binary is not installed.
        return None, False
    combined_log = "{}\n{}".format(
        proc.stdout.decode("utf-8", errors="replace"),
        proc.stderr.decode("utf-8", errors="replace"),
    )
    return combined_log, proc.returncode == 0
# Fatal error patterns — these prevent a valid PDF from being generated.
# Non-fatal issues (overfull hbox, missing figure, float warnings) still
# produce a usable PDF and should NOT trigger the auto-fix retry loop.
_FATAL_ERROR_PATTERNS = [
"runaway argument",
"emergency stop",
"fatal error",
"undefined control sequence",
"missing $ inserted",
"extra alignment tab",
"misplaced alignment tab",
"missing \\begin{document}",
"file `" , # file not found (sty, cls)
"file not found",
]
def _is_fatal_error(err: str) -> bool:
"""Return True if *err* represents a fatal LaTeX error."""
err_lower = err.lower()
# "!" prefix errors are almost always fatal
if err.startswith("!"):
# Non-fatal "!" errors — PDF is still generated
if "overfull" in err_lower or "underfull" in err_lower:
return False
if "float(s) lost" in err_lower:
return False
if "too many unprocessed floats" in err_lower:
return False
# amsmath commands outside math mode — PDF still generates
if "allowed only in math mode" in err_lower:
return False
# Encoding errors for special characters — PDF still generates
if "unavailable in encoding" in err_lower:
return False
# BUG-197: Unicode character errors (e.g. U+202F NARROW NO-BREAK
# SPACE "not set up for use with LaTeX") — pdflatex skips the
# character and generates a valid PDF. Treating these as fatal
# prevents the retry loop from succeeding and blocks bibtex.
# The error line is "! LaTeX Error: Unicode character X (U+XXXX)"
# — the "not set up" text is on a continuation line.
if "unicode character" in err_lower:
return False
return True
for pat in _FATAL_ERROR_PATTERNS:
if pat in err_lower:
return True
return False
def _run_bibtex(work_dir: Path, stem: str, timeout: int = 60) -> bool:
    """Run bibtex for *stem* inside *work_dir* if the binary exists.

    Returns True only when bibtex exits 0 AND actually produced
    ``<stem>.bbl``.

    BUG-197: output is captured as bytes and decoded with
    ``errors="replace"`` to avoid ``UnicodeDecodeError`` from non-ASCII
    bib content.  Every failure mode is logged so silent bibtex issues
    stay diagnosable.
    """
    if not shutil.which("bibtex"):
        logger.warning("bibtex not found on PATH — citations will be [?]")
        return False
    try:
        proc = subprocess.run(
            ["bibtex", stem],
            cwd=work_dir,
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.warning("bibtex timed out after %ds", timeout)
        return False
    except FileNotFoundError:
        return False
    stdout = proc.stdout.decode("utf-8", errors="replace")
    stderr = proc.stderr.decode("utf-8", errors="replace")
    if proc.returncode != 0:
        logger.warning(
            "bibtex returned %d: %s",
            proc.returncode,
            (stdout + stderr).strip()[:500],
        )
        return False
    # Keep bibtex chatter at debug level for diagnostics.
    if stdout.strip():
        logger.debug("bibtex output: %s", stdout.strip()[:300])
    # bibtex can exit 0 without writing a .bbl — verify it exists.
    if not (work_dir / f"{stem}.bbl").exists():
        logger.warning("bibtex ran but %s.bbl was not generated", stem)
        return False
    return True
================================================
FILE: researchclaw/templates/conference.py
================================================
"""Conference template definitions for NeurIPS, ICLR, and ICML.
Each template stores the LaTeX preamble, document structure, author format,
and bibliography style needed to produce a submission-ready ``.tex`` file.
Style files (``.sty``) are NOT bundled — the generated ``.tex`` references
them, and users download the official files from the conference website.
Download URLs are included as comments in the output.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
# Root directory for bundled style files
_STYLES_DIR = Path(__file__).parent / "styles"


@dataclass(frozen=True)
class ConferenceTemplate:
    """LaTeX template specification for one conference.

    Frozen dataclass: instances are immutable module-level constants
    (see the template definitions below).
    """

    name: str                        # canonical key, e.g. "neurips_2025"
    display_name: str                # human-readable, e.g. "NeurIPS 2025"
    year: int
    document_class: str              # \documentclass argument
    style_package: str               # conference .sty name ("" = none)
    style_options: str               # options for the style package
    extra_packages: tuple[str, ...]  # additional \usepackage names
    author_format: str  # "neurips" | "iclr" | "icml"
    bib_style: str                   # \bibliographystyle argument
    columns: int  # 1 or 2
    style_download_url: str          # where users fetch the official files
    preamble_extra: str = ""         # raw LaTeX appended to the preamble

    def render_preamble(
        self,
        title: str,
        authors: str,
        abstract: str,
    ) -> str:
        """Render everything from ``\\documentclass`` through ``\\maketitle``.

        *title*, *authors*, and *abstract* are inserted verbatim — callers
        are responsible for LaTeX-escaping them first.
        """
        # Style options (e.g. "preprint") go on the style package, not documentclass
        options = f"[{self.style_options}]" if self.style_options else ""
        pkg_lines = "\n".join(f"\\usepackage{{{p}}}" for p in self.extra_packages)
        author_block = self._render_authors(authors)
        # Substitute __TITLE__ placeholder in preamble_extra (e.g. ICML running title)
        preamble_extra = self.preamble_extra.replace("__TITLE__", title)
        style_line = (
            f"\\usepackage{options}{{{self.style_package}}}\n"
            if self.style_package
            else ""
        )
        style_comment = (
            f"% Style file: {self.style_download_url}\n"
            if self.style_download_url
            else ""
        )
        # BUG-51 fix: ICML's \begin{icmlauthorlist} is an environment that
        # must appear AFTER \begin{document}. For non-ICML templates the
        # \author{} command is a preamble declaration and stays before.
        if self.author_format == "icml":
            preamble_author = ""
            post_doc_author = f"{author_block}\n\n"
        else:
            preamble_author = f"{author_block}\n"
            post_doc_author = ""
        return (
            f"{style_comment}"
            f"\\documentclass{{{self.document_class}}}\n"
            f"{style_line}"
            f"{pkg_lines}\n"
            f"{preamble_extra}\n"
            f"\n"
            f"\\title{{{title}}}\n"
            f"\n"
            f"{preamble_author}"
            f"\n"
            f"\\begin{{document}}\n"
            f"{post_doc_author}"
            f"\\begin{{abstract}}\n"
            f"{abstract}\n"
            f"\\end{{abstract}}\n"
            f"\n"
            f"\\maketitle\n"
        )

    def render_footer(self, bib_file: str = "references") -> str:
        """Render the bibliography block and ``\\end{document}``."""
        return (
            f"\n\\bibliographystyle{{{self.bib_style}}}\n"
            f"\\bibliography{{{bib_file}}}\n"
            f"\n"
            f"\\end{{document}}\n"
        )

    def get_style_files(self) -> list[Path]:
        """Return bundled ``.sty``/``.bst``/``.cls`` paths for this template.

        Files are stored under ``researchclaw/templates/styles/<name>/``.
        Returns only files that exist on disk (empty list if the style
        directory is absent).
        """
        style_dir = _STYLES_DIR / self.name
        if not style_dir.is_dir():
            return []
        return sorted(
            p for p in style_dir.iterdir()
            if p.suffix in {".sty", ".bst", ".cls"}
        )

    def _render_authors(self, authors: str) -> str:
        # ICML uses an environment-based author list plus an affiliation
        # declaration; everything else takes a plain \author{} command.
        if self.author_format == "icml":
            return (
                f"\\begin{{icmlauthorlist}}\n"
                f"\\icmlauthor{{{authors}}}{{aff1}}\n"
                f"\\end{{icmlauthorlist}}\n"
                f"\\icmlaffiliation{{aff1}}{{Affiliation}}"
            )
        return f"\\author{{{authors}}}"
# ---------------------------------------------------------------------------
# Template definitions
# ---------------------------------------------------------------------------

# -- Legacy (kept for backward compat) --
NEURIPS_2024 = ConferenceTemplate(
    name="neurips_2024",
    display_name="NeurIPS 2024",
    year=2024,
    document_class="article",
    style_package="neurips_2024",
    style_options="preprint",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "nicefrac",
        "microtype",
        "graphicx",
        "natbib",
        "algorithm",
        "algorithmic",
        "adjustbox",
    ),
    author_format="neurips",
    bib_style="plainnat",
    columns=1,
    style_download_url="https://media.neurips.cc/Conferences/NeurIPS2024/Styles.zip",
    # Explicit encoding/font setup so the article class builds reproducibly.
    preamble_extra="\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{lmodern}",
)

ICLR_2025 = ConferenceTemplate(
    name="iclr_2025",
    display_name="ICLR 2025",
    year=2025,
    document_class="article",
    style_package="iclr2025_conference",
    style_options="",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "nicefrac",
        "microtype",
        "graphicx",
        "natbib",
        "algorithm",
        "algorithmic",
        "adjustbox",
    ),
    author_format="iclr",
    # ICLR ships its own .bst alongside the .sty.
    bib_style="iclr2025_conference",
    columns=1,
    style_download_url="https://github.com/ICLR/Master-Template/raw/master/iclr2025.zip",
)

ICML_2025 = ConferenceTemplate(
    name="icml_2025",
    display_name="ICML 2025",
    year=2025,
    document_class="article",
    style_package="icml2025",
    style_options="",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "nicefrac",
        "microtype",
        "graphicx",
        "natbib",
        "algorithm",
        "algorithmic",
        "adjustbox",
    ),
    author_format="icml",
    bib_style="icml2025",
    columns=2,
    style_download_url="https://icml.cc/Conferences/2025/StyleAuthorInstructions",
    # __TITLE__ is substituted with the paper title by render_preamble.
    preamble_extra="\\icmltitlerunning{__TITLE__}",
)

# -- Current (2025/2026) --
NEURIPS_2025 = ConferenceTemplate(
    name="neurips_2025",
    display_name="NeurIPS 2025",
    year=2025,
    document_class="article",
    style_package="neurips_2025",
    style_options="preprint",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "nicefrac",
        "microtype",
        "graphicx",
        "natbib",
        "algorithm",
        "algorithmic",
        "adjustbox",
    ),
    author_format="neurips",
    bib_style="plainnat",
    columns=1,
    style_download_url="https://media.neurips.cc/Conferences/NeurIPS2025/Styles.zip",
    preamble_extra="\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{lmodern}",
)

ICLR_2026 = ConferenceTemplate(
    name="iclr_2026",
    display_name="ICLR 2026",
    year=2026,
    document_class="article",
    style_package="iclr2026_conference",
    style_options="",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "nicefrac",
        "microtype",
        "graphicx",
        "natbib",
        "algorithm",
        "algorithmic",
        "adjustbox",
    ),
    author_format="iclr",
    bib_style="iclr2026_conference",
    columns=1,
    style_download_url="https://github.com/ICLR/Master-Template",
)

ICML_2026 = ConferenceTemplate(
    name="icml_2026",
    display_name="ICML 2026",
    year=2026,
    document_class="article",
    style_package="icml2026",
    style_options="",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "nicefrac",
        "microtype",
        "graphicx",
        "natbib",
        "algorithm",
        "algorithmic",
        "adjustbox",
        "morefloats",
    ),
    author_format="icml",
    bib_style="icml2026",
    columns=2,
    style_download_url="https://icml.cc/Conferences/2026/AuthorInstructions",
    preamble_extra="\\icmltitlerunning{__TITLE__}",
)

# -- Generic (non-ML) --
GENERIC = ConferenceTemplate(
    name="generic",
    display_name="Generic Academic Paper",
    year=2025,
    document_class="article",
    style_package="",   # plain article: no conference .sty required
    style_options="",
    extra_packages=(
        "hyperref",
        "url",
        "booktabs",
        "amsfonts",
        "amsmath",
        "graphicx",
        "natbib",
        "geometry",
        "adjustbox",
    ),
    author_format="neurips",
    bib_style="plainnat",
    columns=1,
    style_download_url="",
    preamble_extra="\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{lmodern}\n\\usepackage[margin=1in]{geometry}",
)

# ---------------------------------------------------------------------------
# Registry — short aliases point to LATEST version of each conference
# ---------------------------------------------------------------------------
CONFERENCE_REGISTRY: dict[str, ConferenceTemplate] = {
    # Latest (default aliases)
    "neurips": NEURIPS_2025,
    "iclr": ICLR_2026,
    "icml": ICML_2026,
    # Generic for non-ML domains
    "generic": GENERIC,
    "article": GENERIC,
    # Versioned keys (all versions)
    "neurips_2025": NEURIPS_2025,
    "neurips_2024": NEURIPS_2024,
    "iclr_2026": ICLR_2026,
    "iclr_2025": ICLR_2025,
    "icml_2026": ICML_2026,
    "icml_2025": ICML_2025,
}
def get_template(name: str) -> ConferenceTemplate:
    """Look up a conference template by name.

    Accepts full names (``"neurips_2024"``) and short aliases
    (``"neurips"``); lookup is case-insensitive and treats ``-`` and
    spaces as ``_``.  Raises ``KeyError`` if *name* is not registered.
    """
    key = name.lower().strip().replace("-", "_").replace(" ", "_")
    template = CONFERENCE_REGISTRY.get(key)
    if template is None:
        available = ", ".join(sorted({t.name for t in CONFERENCE_REGISTRY.values()}))
        raise KeyError(f"Unknown conference template: {name!r}. Available: {available}")
    return template
def list_conferences() -> list[str]:
    """Return deduplicated, sorted list of canonical template names."""
    canonical = {tpl.name for tpl in CONFERENCE_REGISTRY.values()}
    return sorted(canonical)
================================================
FILE: researchclaw/templates/converter.py
================================================
"""Markdown-to-LaTeX converter with conference template support.
Converts a ResearchClaw paper (Markdown with embedded LaTeX math) into a
complete ``.tex`` file using a :class:`ConferenceTemplate` for preamble,
author block, bibliography style, and document structure.
Design constraints:
- **Zero new dependencies** — stdlib only (``re``, ``textwrap``).
- Handles inline math ``\\(...\\)``, display math ``\\[...\\]``,
bold/italic, bullet lists, numbered lists, code blocks, tables,
and ``\\cite{...}`` references.
- Extracts abstract from ``# Abstract`` or ``## Abstract`` section.
- ICML two-column structure handled via template's ``render_preamble``.
"""
from __future__ import annotations
import re
import textwrap
import threading
from dataclasses import dataclass, field
from researchclaw.templates.conference import ConferenceTemplate
_render_counters = threading.local()
def _reset_render_counters() -> None:
"""Reset per-render figure and table counters for the current thread."""
_render_counters.table = 0
_render_counters.figure = 0
def _next_table_num() -> int:
"""Return the next table number for the current thread."""
next_num = getattr(_render_counters, "table", 0) + 1
_render_counters.table = next_num
return next_num
def _next_figure_num() -> int:
"""Return the next figure number for the current thread."""
next_num = getattr(_render_counters, "figure", 0) + 1
_render_counters.figure = next_num
return next_num
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def markdown_to_latex(
    paper_md: str,
    template: ConferenceTemplate,
    *,
    title: str = "",
    authors: str = "Anonymous",
    bib_file: str = "references",
    bib_entries: dict[str, str] | None = None,
) -> str:
    """Convert a Markdown paper to a complete LaTeX document.

    Parameters
    ----------
    paper_md:
        Full paper in Markdown with embedded LaTeX math.
    template:
        Conference template controlling preamble and structure.
    title:
        Paper title. If empty, extracted from ``# Title`` heading or the
        first ``# ...`` heading in *paper_md*.
    authors:
        Author string inserted into the template author block.
    bib_file:
        Bibliography filename (without ``.bib`` extension).
    bib_entries:
        Optional mapping of author-year patterns to cite_keys for
        recovering author-year citations that slipped through earlier
        processing, e.g. ``{"Raissi et al., 2019": "raissi2019physics"}``.

    Returns
    -------
    str
        A complete ``.tex`` file ready for compilation.
    """
    # Figure/table numbering is thread-local; start each render at zero.
    _reset_render_counters()
    # Strip LLM artifacts (outer code fences, HTML entities, etc.) before
    # structural parsing.
    paper_md = _preprocess_markdown(paper_md)
    # _preprocess_markdown also rounds metrics; this extra pass is a
    # harmless repeat (rounding is idempotent).
    paper_md = _round_raw_metrics(paper_md)
    sections = _parse_sections(paper_md)
    # Extract title from first H1 heading if not provided
    if not title:
        title = _extract_title(sections, paper_md)
    # Extract abstract
    abstract = _extract_abstract(sections)
    # Build body (everything except title/abstract headings)
    body = _build_body(sections, title=title)
    # IMP-30: Detect and remove duplicate tables
    body = _deduplicate_tables(body)
    # R10-Fix5: Completeness check
    completeness_warnings = check_paper_completeness(sections)
    if completeness_warnings:
        import logging
        _logger = logging.getLogger(__name__)
        for warning in completeness_warnings:
            _logger.warning("LaTeX completeness check: %s", warning)
        # BUG-28: Log warnings only — don't inject comments into LaTeX body
    preamble = template.render_preamble(
        title=_escape_latex(title),
        authors=authors,
        abstract=_convert_inline(abstract),
    )
    footer = template.render_footer(bib_file)
    tex = preamble + "\n" + body + footer
    # Final sanitization pass on the complete LaTeX output
    tex = _sanitize_latex_output(tex, bib_entries=bib_entries)
    return tex
# ---------------------------------------------------------------------------
# Post-processing: sanitize final LaTeX
# ---------------------------------------------------------------------------
def _sanitize_latex_output(
    tex: str,
    *,
    bib_entries: dict[str, str] | None = None,
) -> str:
    """Remove artifacts that slip through pre-processing into the final .tex.

    Parameters
    ----------
    tex:
        Complete LaTeX document (preamble + body + footer).
    bib_entries:
        Optional mapping of author-year citation text to bib keys, used
        as a BUG-102 safety net to recover citations missed upstream.

    Returns
    -------
    str
        Sanitized LaTeX source.
    """
    # 0. BUG-102 safety net: Convert remaining author-year citations to \cite{}.
    # If upstream conversion missed any [Author et al., 2024] patterns, catch them here.
    if bib_entries:
        # Longest patterns first so a longer author string wins over a prefix.
        for ay_pattern in sorted(bib_entries, key=len, reverse=True):
            cite_key = bib_entries[ay_pattern]
            # [Author et al., 2024] → \cite{key}
            tex = tex.replace(f"[{ay_pattern}]", f"\\cite{{{cite_key}}}")
            # Also handle inside existing brackets (multi-citation)
            tex = tex.replace(ay_pattern, f"\\cite{{{cite_key}}}")

        # Clean up double-nested \cite from multi-citation brackets:
        # [\cite{a}, \cite{b}] → \cite{a, b}
        def _merge_bracket_cites(m: re.Match[str]) -> str:
            inner = m.group(1)
            keys = re.findall(r"\\cite\{([^}]+)\}", inner)
            if keys:
                return "\\cite{" + ", ".join(keys) + "}"
            return m.group(0)

        tex = re.sub(r"\[([^\]]*\\cite\{[^\]]+)\]", _merge_bracket_cites, tex)
    # 1. Remove broken citation markers: \cite{?key:NOT_IN_BIB} or literal [?key:NOT_IN_BIB]
    tex = re.sub(r"\\cite\{\?[^}]*:NOT_IN_BIB\}", "", tex)
    tex = re.sub(r"\[\?[a-zA-Z0-9_:-]+:NOT_IN_BIB\]", "", tex)
    # 1b. Convert leftover raw bracket citations [key2019word, key2020word] → \cite{...}
    # Skip inside verbatim/lstlisting environments to avoid corrupting code blocks.
    _CITE_KEY_PAT_L = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9_]*"
    _VERBATIM_RE = re.compile(
        r"(\\begin\{(?:verbatim|lstlisting|minted)\}.*?\\end\{(?:verbatim|lstlisting|minted)\})",
        re.DOTALL,
    )
    _cite_re = re.compile(
        rf"\[({_CITE_KEY_PAT_L}(?:\s*,\s*{_CITE_KEY_PAT_L})*)\]"
    )

    def _cite_outside_verbatim(tex_src: str) -> str:
        # The capture group in _VERBATIM_RE keeps verbatim blocks in the
        # split result; only the non-verbatim parts get rewritten.
        parts = _VERBATIM_RE.split(tex_src)
        for i, part in enumerate(parts):
            if not _VERBATIM_RE.match(part):
                parts[i] = _cite_re.sub(r"\\cite{\1}", part)
        return "".join(parts)

    tex = _cite_outside_verbatim(tex)
    # 1c. BUG-110 safety net: Replace any remaining Unicode Greek/math symbols.
    # _convert_inline handles most, but titles, captions, and preamble
    # fragments can still contain raw Unicode that kills pdflatex.
    for _uchar, _lcmd in _UNICODE_GREEK_TO_LATEX.items():
        if _uchar in tex:
            tex = tex.replace(_uchar, _lcmd)
    # 2. Decode HTML entities that survived pre-processing.
    # FIX: these literals had decayed ("&nbsp;" to a plain space, "&amp;"
    # to a bare "&"), which made the replacements rewrite every space and
    # every ampersand in the document instead of just the entities.
    tex = tex.replace("&nbsp;", "~")
    tex = tex.replace("&amp;", "\\&")
    # 2b. Fix escaped \& inside tabular data rows. The converter's
    # _convert_inline escapes & globally; inside tabular environments
    # the & must remain unescaped as the column separator.
    if "\\begin{tabular}" in tex and "\\&" in tex:
        def _fix_tabular_amp(m: re.Match[str]) -> str:
            block = m.group(0)
            if "\\&" not in block:
                return block
            lines = block.split("\n")
            for i, line in enumerate(lines):
                # Only data rows (terminated by \\) are un-escaped.
                if "\\&" in line and "\\\\" in line:
                    lines[i] = line.replace("\\&", "&")
            return "\n".join(lines)

        tex = re.sub(
            r"\\begin\{tabular\}.*?\\end\{tabular\}",
            _fix_tabular_amp,
            tex,
            flags=re.DOTALL,
        )
    # 3. Remove stray markdown code fences in LaTeX body (outside verbatim)
    # Only match fences NOT inside \begin{verbatim}...\end{verbatim}
    # Simple approach: remove ``` lines that don't have verbatim nearby
    tex = re.sub(r"^(\s*```[a-z]*\s*)$", r"% removed stray fence: \1", tex, flags=re.MULTILINE)
    # 4. Fix placeholder table captions: \caption{Table N} → descriptive
    # Can't auto-generate content, but at least don't leave "Table 1" as
    # the only caption text — append " -- See text for details."
    tex = re.sub(
        r"\\caption\{(Table\s+\d+)\}",
        r"\\caption{\1 -- Summary of experimental results.}",
        tex,
    )
    # 4b. Auto-map orphan \ref{fig:X} to closest \label{fig:Y} by prefix.
    # The converter generates long labels from captions (fig:overall_cifar_100)
    # but the LLM references short names (fig:overall).
    fig_labels = set(re.findall(r"\\label\{(fig:[^}]+)\}", tex))
    fig_refs = set(re.findall(r"\\ref\{(fig:[^}]+)\}", tex))
    orphan_refs = fig_refs - fig_labels
    orphan_labels = fig_labels - fig_refs
    if orphan_refs and orphan_labels:
        for oref in orphan_refs:
            # Find a label that starts with the ref prefix; map only on a
            # unique match to avoid mis-attribution.
            candidates = [l for l in orphan_labels if l.startswith(oref)]
            if len(candidates) == 1:
                tex = tex.replace(f"\\ref{{{oref}}}", f"\\ref{{{candidates[0]}}}")
                orphan_labels.discard(candidates[0])
    # 5. Fix "Untitled Paper" / "Running Title" fallback titles
    tex = re.sub(
        r"\\title\{Untitled Paper\}",
        r"\\title{[Title Generation Failed -- Manual Title Required]}",
        tex,
    )
    tex = re.sub(
        r"\\icmltitlerunning\{Running Title\}",
        "",
        tex,
    )
    # 6. Remove \texttt{} wrapped raw metric paths that the LLM dumped
    # Handles both raw underscores and LaTeX-escaped underscores (\_)
    # Pattern: condition/env/step/metric_name: value (3+ path segments)
    tex = re.sub(
        r"\\texttt\{[a-zA-Z0-9_\\_/.:=-]+(?:/[a-zA-Z0-9_\\_/.:=-]+){2,}(?:\s*[=:]\s*[^}]*)?\}",
        "",
        tex,
    )
    # 6b. Remove entire \item lines that are just metric paths
    tex = re.sub(
        r"^\s*\\item\s*\\texttt\{[^}]*\}\s*$",
        "",
        tex,
        flags=re.MULTILINE,
    )
    # 7. Clean up empty \item lines that result from removed content
    tex = re.sub(r"\\item\s*\n\s*\\item", r"\\item", tex)
    # Also remove completely empty \item lines (just whitespace after \item)
    tex = re.sub(r"^\s*\\item\s*$", "", tex, flags=re.MULTILINE)
    # 8. Remove consecutive blank lines (more than 2)
    tex = re.sub(r"\n{3,}", "\n\n", tex)
    return tex
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------
_OUTER_FENCE_RE = re.compile(
r"^\s*```(?:markdown|md|latex|tex)?\s*\n(.*?)^\s*```\s*$",
re.MULTILINE | re.DOTALL,
)
# Greedy variant — matches the *last* closing fence so inner code blocks
# (```text … ```) don't truncate the capture prematurely.
_OUTER_FENCE_GREEDY_RE = re.compile(
r"^\s*```(?:markdown|md|latex|tex)?\s*\n(.*)^\s*```\s*$",
re.MULTILINE | re.DOTALL,
)
# Pattern for raw metric values with excessive decimal places
# e.g. 0.9717036975193437 → 0.972
_RAW_METRIC_RE = re.compile(r"(\d+\.\d{5,})")
def _round_raw_metrics(text: str) -> str:
"""Round excessively precise metric values (>4 decimal places).
Uses significant-figure-aware rounding so small values like
learning rates (e.g. 0.00001) are preserved instead of becoming 0.0000.
"""
def _rounder(m: re.Match[str]) -> str:
try:
val = float(m.group(1))
if val == 0.0:
return "0.0"
# For very small values (< 0.001), use 2 significant figures
# to preserve scientific meaning (e.g. lr=0.00003 → 0.00003)
import math
abs_val = abs(val)
if abs_val < 0.001:
sig_figs = 2
digits = sig_figs - int(math.floor(math.log10(abs_val))) - 1
return f"{val:.{digits}f}"
# Normal range: 4 decimal places
return f"{val:.4f}"
except (ValueError, OverflowError):
return m.group(0)
return _RAW_METRIC_RE.sub(_rounder, text)
def _preprocess_markdown(md: str) -> str:
    """Clean up common LLM artifacts before parsing.

    1. Strip outer fenced code blocks (e.g. triple-backtick markdown) that
       LLMs wrap around the entire paper content.
    2. Remove standalone Markdown horizontal rules (``---``, ``***``, ``___``)
       and decode stray HTML entities.
    3. Convert blockquotes (``> text``) to a form the converter can handle.
    4. Round excessively precise metric values.
    5. Normalize mid-line section headings onto their own lines.
    """
    text = md
    # 1. Strip outer markdown fences (LLMs sometimes wrap entire paper in them)
    # Repeatedly strip in case of double-wrapping.
    # Try greedy match first (handles papers with inner code blocks),
    # then fall back to non-greedy if greedy doesn't help.
    for _ in range(3):
        stripped = False
        for pat in (_OUTER_FENCE_GREEDY_RE, _OUTER_FENCE_RE):
            m = pat.search(text)
            # Only accept the capture when it keeps most of the document —
            # guards against matching a small inner fence by accident.
            if m and len(m.group(1)) > len(text) * 0.5:
                text = m.group(1)
                stripped = True
                break
        if not stripped:
            # Also handle the case where the first line is ```markdown
            # and the last non-blank line is ``` (simple boundary strip)
            lines = text.split("\n")
            first = lines[0].strip() if lines else ""
            last_idx = len(lines) - 1
            while last_idx > 0 and not lines[last_idx].strip():
                last_idx -= 1
            last = lines[last_idx].strip() if last_idx > 0 else ""
            if (
                re.match(r"^```(?:markdown|md|latex|tex)?\s*$", first)
                and last == "```"
            ):
                text = "\n".join(lines[1:last_idx])
                stripped = True
        if not stripped:
            break
    # 2. Remove standalone horizontal rules (---, ***, ___)
    text = re.sub(r"^\s*[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
    # 2a. Decode HTML entities that LLMs inject into markdown.
    # BUG FIX: these replacements were identity no-ops (e.g.
    # replace("&", "&")) because the entity names had been decoded away in
    # the source; the actual entities are restored here.
    text = text.replace("&nbsp;", " ")
    text = text.replace("&amp;", "&")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&mdash;", "---")
    text = text.replace("&ndash;", "--")
    # Also normalize already-decoded non-breaking spaces and unicode dashes
    # to plain-ASCII TeX equivalents.
    text = text.replace("\u00a0", " ")
    text = text.replace("\u2014", "---")
    text = text.replace("\u2013", "--")
    # 2b. Note: stray code fences are handled in _sanitize_latex_output
    # after conversion, not here (to avoid breaking real code blocks).
    # 2c. Round excessively precise metric values (e.g. 0.9717036975 → 0.9717)
    text = _round_raw_metrics(text)
    # 2d. Remove raw \texttt{...} or backtick-wrapped metric key paths
    # Pattern: \texttt{some/long/metric_path/name: 0.1234} or `path/to/metric: val`
    text = re.sub(
        r"\\texttt\{[a-zA-Z0-9_/.:=-]+(?:/[a-zA-Z0-9_/.:=-]+){2,}(?:\s*[=:]\s*[^}]*)?\}",
        "",
        text,
    )
    # Also strip backtick-wrapped metric paths in markdown source
    text = re.sub(
        r"`[a-zA-Z0-9_/.-]+(?:/[a-zA-Z0-9_/.-]+){2,}(?:\s*[=:]\s*[^`]*)?`",
        "",
        text,
    )
    # 2e. Clean NOT_IN_BIB citation markers: [?key:NOT_IN_BIB] → remove
    text = re.sub(r"\[\?[a-zA-Z0-9_:-]+:NOT_IN_BIB\]", "", text)
    # 3. Convert blockquotes: > text → \begin{quote}text\end{quote}
    # Collect consecutive > lines into a single quote block.
    lines = text.split("\n")
    out_lines: list[str] = []
    in_quote = False
    quote_buf: list[str] = []
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("> "):
            if not in_quote:
                in_quote = True
                quote_buf = []
            quote_buf.append(stripped[2:])
        elif stripped == ">" and in_quote:
            # Bare ">" inside a quote is an intentional blank quote line.
            quote_buf.append("")
        else:
            if in_quote:
                out_lines.append("\\begin{quote}")
                out_lines.extend(quote_buf)
                out_lines.append("\\end{quote}")
                in_quote = False
                quote_buf = []
            out_lines.append(line)
    # Flush a quote block that runs to the end of the document.
    if in_quote:
        out_lines.append("\\begin{quote}")
        out_lines.extend(quote_buf)
        out_lines.append("\\end{quote}")
    text = "\n".join(out_lines)
    # 4. T1.2: Remove stray markdown/latex/text fences that appear mid-document.
    # LLMs sometimes emit ```markdown or ```latex between sections.
    # Only remove documentation fences — preserve code fences (```python etc.)
    _CODE_LANGS = frozenset({
        "python", "java", "cpp", "c", "javascript", "typescript", "rust",
        "go", "ruby", "bash", "sh", "sql", "r", "julia", "lua", "perl",
        "scala", "kotlin", "swift", "haskell", "algorithm", "pseudocode",
    })
    _lines = text.split("\n")
    _cleaned: list[str] = []
    _in_code = False
    for _l in _lines:
        _stripped = _l.strip()
        if _stripped.startswith("```") and not _in_code:
            _lang = _stripped[3:].strip().lower()
            if _lang in _CODE_LANGS or _lang.startswith("algorithm"):
                # Real code block — keep
                _in_code = True
                _cleaned.append(_l)
            elif _lang in ("markdown", "md", "latex", "tex", "text", "", "bibtex"):
                # Documentation/wrapper fence — remove
                pass
            else:
                # Unknown lang — keep to be safe
                _in_code = True
                _cleaned.append(_l)
        elif _stripped == "```" and _in_code:
            # Closing fence for a code block — keep
            _in_code = False
            _cleaned.append(_l)
        elif _stripped == "```" and not _in_code:
            # Stray fence — remove
            pass
        else:
            _cleaned.append(_l)
    text = "\n".join(_cleaned)
    # 5. Normalize mid-line section headings (IMP-17)
    # LLM output may concatenate sections onto single long lines:
    #   "...text ## Abstract Body text ## 1. Introduction More text..."
    # Ensure each heading marker starts on its own line so _parse_sections
    # can detect them with the ^-anchored regex.
    text = re.sub(r"(?<=[^\n]) +(#{1,4}) +", r"\n\n\1 ", text)
    return text
# ---------------------------------------------------------------------------
# Section parsing
# ---------------------------------------------------------------------------
@dataclass
class _Section:
    """A parsed Markdown section."""

    level: int  # 1 = ``#``, 2 = ``##``, 3 = ``###``, etc.; 0 = text before any heading
    heading: str  # heading text without the leading ``#`` markers
    body: str  # raw Markdown body belonging to this section
    # Lower-cased, stripped heading — computed once in __post_init__ so
    # callers can do cheap case-insensitive comparisons.
    heading_lower: str = field(init=False)

    def __post_init__(self) -> None:
        self.heading_lower = self.heading.strip().lower()
# Matches ATX headings ``#`` through ``####`` at the start of a line.
_HEADING_RE = re.compile(r"^(#{1,4})\s+(.+)$", re.MULTILINE)
# Known section heading names used to separate heading from concatenated body
_KNOWN_SECTION_NAMES = {
    "abstract",
    "introduction",
    "related work",
    "background",
    "method",
    "methods",
    "methodology",
    "approach",
    "framework",
    "experiments",
    "experiment",
    "experimental setup",
    "experimental results",
    "results",
    "results and discussion",
    "analysis",
    "discussion",
    "conclusion",
    "conclusions",
    "limitations",
    "acknowledgments",
    "acknowledgements",
    "references",
    "appendix",
    "contributions",
    "problem setting",
    "problem statement",
    "problem definition",
    "problem formulation",
    "study positioning",
    "study positioning and scope",
    "evaluation",
    "evaluation environment",
    "design rationale",
    "complexity",
    "unified algorithm",
    "method positioning",
    "methods compared",
    "common protonet backbone",
    "preference optimization backbone",
}
# Connector words that may legitimately appear inside a heading; a capitalised
# word directly after one of these is treated as still part of the heading,
# not as the start of the body (see _separate_heading_body).
_HEADING_CONNECTORS = frozenset(
    {
        "and", "or", "for", "in", "of", "the", "a", "an", "with",
        "under", "to", "on", "at", "by", "as", "via", "from",
        "not", "but", "yet", "nor", "vs", "versus", "is", "are",
    }
)
# Words that commonly open a sentence; a capitalised occurrence of one of
# these inside an over-long heading is a strong signal that body text begins
# at that word (see _separate_heading_body).
_SENTENCE_STARTERS = frozenset(
    {
        "the", "a", "an", "this", "these", "those", "that",
        "it", "we", "our", "their", "its", "each", "every",
        "in", "for", "to", "here", "there", "however", "moreover",
        "furthermore", "additionally", "specifically", "notably",
        "all", "many", "several", "some", "most", "both",
        "among", "between", "across", "unlike", "given", "such",
        "while", "although", "because", "since", "when", "where",
        "rather", "let", "table", "figure", "as", "at", "if",
    }
)
def _separate_heading_body(heading: str) -> tuple[str, str]:
    """Separate heading text from accidentally concatenated body text.

    LLM output may produce lines like ``## Abstract Body text here...``
    where the heading is just ``Abstract`` and the rest is body.

    Returns (heading, extra_body) where extra_body may be empty.
    """
    # Very short headings are fine as-is
    if len(heading) <= 60:
        return heading, ""
    # Strip optional leading section number for matching
    num_match = re.match(r"^(\d+(?:\.\d+)*\.?\s+)", heading)
    num_prefix = num_match.group(1) if num_match else ""
    rest = heading[len(num_prefix):]
    rest_lower = rest.lower()
    # Check against known section heading names.
    # Longest names first so e.g. "results and discussion" wins over "results".
    for name in sorted(_KNOWN_SECTION_NAMES, key=len, reverse=True):
        if rest_lower.startswith(name) and len(rest) > len(name) + 1:
            after = rest[len(name) :]
            # Require whitespace right after the name (word boundary) so a
            # name that is merely a prefix of a longer word doesn't split.
            if after and after[0] in " \t":
                return (num_prefix + rest[: len(name)]).strip(), after.strip()
    # Word-count heuristic for unknown subsection headings.
    # Scan for the first plausible heading-body boundary.
    words = heading.split()
    if len(words) > 6:
        for n in range(2, min(12, len(words) - 2)):
            curr = words[n]
            # A boundary candidate must begin with a capital letter.
            if not curr or not curr[0].isupper():
                continue
            prev_word = words[n - 1].rstrip(".,;:").lower()
            # Don't split right after a connector ("and", "of", …) — the
            # capitalised word is probably still part of the heading.
            if prev_word in _HEADING_CONNECTORS:
                continue
            remaining = " ".join(words[n:])
            # Too little remaining text to be a plausible body.
            if len(remaining) <= 30:
                continue
            # Strong signal: common sentence-starting word
            if curr.lower() in _SENTENCE_STARTERS:
                return " ".join(words[:n]).strip(), remaining.strip()
            # Medium signal: next word is lowercase (sentence-like)
            # and heading has >= 4 words, body is substantial (> 100 chars)
            if n >= 4 and n + 1 < len(words):
                next_w = words[n + 1].rstrip(".,;:")
                if next_w and next_w[0].islower() and len(remaining) > 100:
                    return " ".join(words[:n]).strip(), remaining.strip()
            # Weak fallback for very long headings (conservative)
            if n >= 8 and len(remaining) > 100:
                return " ".join(words[:n]).strip(), remaining.strip()
    # Detect repeated multi-word opening phrase: the body often starts with
    # the same words as the heading (e.g. "Graph-memory methods Graph-memory
    # methods maintain a graph...").  Try longer phrases first.
    half = len(rest) // 2
    for phrase_len in range(min(30, half), 14, -1):
        phrase = rest[:phrase_len]
        # A single word is too weak a signal for the repetition heuristic.
        if " " not in phrase:
            continue
        repeat_pos = rest.find(phrase, phrase_len)
        if repeat_pos > 0:
            return (
                (num_prefix + rest[:repeat_pos]).strip(),
                rest[repeat_pos:].strip(),
            )
    # Fallback: try to split at a sentence boundary within first 200 chars
    if len(heading) > 200:
        m = re.search(r"[.;:]\s+([A-Z])", heading[:300])
        if m and m.start() > 10:
            return heading[: m.start() + 1].strip(), heading[m.start() + 2 :].strip()
    return heading, ""
def _parse_sections(md: str) -> list[_Section]:
    """Split Markdown into a flat list of sections by heading."""
    heading_matches = list(_HEADING_RE.finditer(md))
    if not heading_matches:
        # No headings at all — treat the whole document as one section.
        return [_Section(level=1, heading="", body=md)]
    result: list[_Section] = []
    # Any text preceding the first heading becomes a level-0 preamble section.
    if heading_matches[0].start() > 0:
        preamble = md[: heading_matches[0].start()].strip()
        if preamble:
            result.append(_Section(level=0, heading="", body=preamble))
    last = len(heading_matches) - 1
    for idx, match in enumerate(heading_matches):
        depth = len(match.group(1))
        raw_heading = match.group(2).strip()
        body_end = len(md) if idx == last else heading_matches[idx + 1].start()
        section_body = md[match.end() : body_end].strip()
        # IMP-17: a heading line may carry concatenated body text — split it
        # off and prepend it to the section body.
        raw_heading, spill = _separate_heading_body(raw_heading)
        if spill:
            section_body = spill + ("\n\n" + section_body if section_body else "")
        result.append(_Section(level=depth, heading=raw_heading, body=section_body))
    return result
# ---------------------------------------------------------------------------
# Extraction helpers
# ---------------------------------------------------------------------------
# Meta-headings that can never themselves be the paper title.
_TITLE_SKIP = {
    "title",
    "abstract",
    "references",
    "appendix",
    "acknowledgments",
    "acknowledgements",
}
# T1.1: Headings that are NOT valid paper titles (tables, figures, etc.)
_TITLE_REJECT_RE = re.compile(
    r"^(?:table|figure|fig\.|tab\.|algorithm|listing|appendix)\s",
    re.IGNORECASE,
)
# T1.1: Headings that look like metric dumps rather than titles
_METRIC_DUMP_RE = re.compile(
    r"(?:primary_metric|accuracy|loss|f1_score|precision|recall)\b",
    re.IGNORECASE,
)
def _extract_title(sections: list[_Section], raw_md: str) -> str:
    """Extract paper title from sections or raw markdown."""
    # Pass 1: explicit "# Title"/"## Title" meta-sections.  The real title is
    # either on the first body line (possibly **bold**) or embedded directly
    # in the heading ("## Title Actual Paper Title").
    for sec in sections:
        if sec.level not in (1, 2):
            continue
        if sec.heading_lower == "title":
            candidate = sec.body.split("\n")[0].strip()
            # Unwrap bold markers around the title text.
            candidate = re.sub(r"\*\*(.+?)\*\*", r"\1", candidate)
            if candidate and not _is_bad_title(candidate):
                return candidate
        if sec.heading_lower.startswith("title ") and len(sec.heading) > 6:
            return sec.heading[6:].strip()
    # Pass 2: first H1/H2 heading that isn't a meta-heading or artefact.
    for sec in sections:
        if sec.level not in (1, 2) or not sec.heading:
            continue
        if sec.heading_lower in _TITLE_SKIP or _is_bad_title(sec.heading):
            continue
        return sec.heading
    # Last resort: first non-empty line of the raw markdown (still filtered).
    for raw_line in raw_md.splitlines():
        candidate = raw_line.strip().lstrip("#").strip()
        if candidate and not _is_bad_title(candidate):
            return candidate
    return "Untitled Paper"
def _is_bad_title(candidate: str) -> bool:
    """Return True if *candidate* is clearly not a paper title."""
    # "Table 1 – ...", "Figure 2: ...", algorithm/listing captions, etc.
    looks_like_caption = _TITLE_REJECT_RE.match(candidate) is not None
    # Raw metric key dumps ("primary_metric", "accuracy", ...).
    looks_like_metrics = _METRIC_DUMP_RE.search(candidate) is not None
    # Raw underscore variable paths (e.g. primary_metric/val).
    looks_like_var_path = re.search(r"\w+_\w+/\w+", candidate) is not None
    return looks_like_caption or looks_like_metrics or looks_like_var_path
def _extract_abstract(sections: list[_Section]) -> str:
    """Extract abstract text from sections."""
    for sec in sections:
        if sec.heading_lower == "abstract":
            return sec.body
        # IMP-17 fallback: heading may still contain body text if
        # _separate_heading_body didn't recognise the pattern.
        if len(sec.heading) > 20 and sec.heading_lower.startswith("abstract "):
            spill = sec.heading[len("Abstract") :].strip()
            return spill + ("\n\n" + sec.body if sec.body else "")
    return ""
# ---------------------------------------------------------------------------
# Body building
# ---------------------------------------------------------------------------
# Sections rendered in the preamble (\title / abstract env), never in the body.
_SKIP_HEADINGS = {"title", "abstract"}


def _build_body(sections: list[_Section], *, title: str = "") -> str:
    """Convert all non-title/abstract sections to LaTeX body text.

    When a paper has its title as an H1 heading (``# My Paper Title``),
    that heading is already rendered via ``\\title{}`` in the preamble.
    We skip it here and promote remaining headings so that H2 (``##``)
    maps to ``\\section``, H3 to ``\\subsection``, etc.
    """
    title_lower = title.strip().lower()
    # Determine minimum heading level used for real body sections
    # (skip title/abstract/references).
    title_h1_found = False
    for sec in sections:
        if (
            sec.level == 1
            and sec.heading
            and sec.heading.strip().lower() == title_lower
        ):
            title_h1_found = True
            break
    # T1.3: Auto-detect when all body sections use H2 (##) instead of H1 (#).
    # This happens when the LLM uses ## for main sections (Introduction, Method, etc.)
    # without an explicit H1 title heading. We must promote H2→\section.
    body_levels: set[int] = set()
    for sec in sections:
        if sec.heading_lower not in _SKIP_HEADINGS and sec.level >= 1:
            # The title H1 itself does not count as a body section.
            if not (sec.level == 1 and sec.heading.strip().lower() == title_lower):
                body_levels.add(sec.level)
    min_body_level = min(body_levels) if body_levels else 1
    # Promote if: (a) title was H1 and body starts at H2, OR
    # (b) no title H1 found but all body sections are H2+ (LLM omitted H1 title)
    # BUG-166: When title is H1 AND body also uses H1 for main sections,
    # offset must be 0 — otherwise H1→max(1,1-1)=1 and H2→max(1,2-1)=1
    # both collapse to \section, losing all subsection hierarchy.
    if title_h1_found:
        level_offset = 1 if min_body_level >= 2 else 0
    elif min_body_level >= 2:
        # All body sections are H2 or deeper — promote so H2→\section
        level_offset = min_body_level - 1
    else:
        level_offset = 0
    # Effective (post-promotion) heading depth → LaTeX sectioning command.
    _level_map = {
        1: "section",
        2: "subsection",
        3: "subsubsection",
        4: "paragraph",
    }
    parts: list[str] = []
    for sec in sections:
        # Skip title-only and abstract sections
        if sec.heading_lower in _SKIP_HEADINGS:
            continue
        # Skip the H1 heading that was used as the paper title
        if (
            sec.level == 1
            and sec.heading
            and sec.heading.strip().lower() == title_lower
        ):
            continue
        if sec.level == 0:
            # Preamble text before any heading — include as-is
            parts.append(_convert_block(sec.body))
            continue
        effective_level = max(1, sec.level - level_offset)
        # Anything deeper than the map falls back to \paragraph.
        cmd = _level_map.get(effective_level, "paragraph")
        heading_tex = _escape_latex(sec.heading)
        # Strip leading manual section numbers: "1. Introduction" → "Introduction"
        # Handles: "1 Intro", "2.1 Related", "3.2.1 Details", "1. Intro"
        heading_tex = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", heading_tex)
        parts.append(f"\\{cmd}{{{heading_tex}}}")
        # Generate a label for cross-referencing
        if cmd in ("section", "subsection", "subsubsection"):
            label_key = re.sub(r"[^a-z0-9]+", "_", heading_tex.lower()).strip("_")[:40]
            if label_key:
                parts.append(f"\\label{{sec:{label_key}}}")
        if sec.body:
            parts.append(_convert_block(sec.body))
    return "\n\n".join(parts) + "\n"
def _deduplicate_tables(body: str) -> str:
"""IMP-30: Remove duplicate tables that share the same header row.
LLMs sometimes repeat tables (e.g. same results table in Results and
Discussion). We keep the first occurrence and drop subsequent copies.
"""
import logging as _dup_log
_TABLE_ENV_RE = re.compile(
r"(\\begin\{table\}.*?\\end\{table\})", re.DOTALL
)
tables = list(_TABLE_ENV_RE.finditer(body))
if len(tables) < 2:
return body
seen_headers: dict[str, int] = {}
drop_spans: list[tuple[int, int]] = []
for m in tables:
table_text = m.group(1)
# Extract header row (first row after \toprule)
header_match = re.search(r"\\toprule\s*\n(.+?)\\\\", table_text)
if not header_match:
continue
header_key = re.sub(r"\s+", " ", header_match.group(1).strip())
if header_key in seen_headers:
drop_spans.append((m.start(), m.end()))
_dup_log.getLogger(__name__).info(
"IMP-30: Dropping duplicate table (same header as table #%d)",
seen_headers[header_key],
)
else:
seen_headers[header_key] = len(seen_headers) + 1
# Remove duplicates in reverse order to preserve offsets
for start, end in reversed(drop_spans):
body = body[:start] + body[end:]
return body
# ---------------------------------------------------------------------------
# Block-level conversion
# ---------------------------------------------------------------------------
# Patterns for block-level structures
# Display math delimited by \[ ... \] (may span multiple lines).
_DISPLAY_MATH_RE = re.compile(r"^\\\[(.+?)\\\]$", re.MULTILINE | re.DOTALL)
# $$...$$ display math (single- or multi-line)
_DISPLAY_MATH_DOLLAR_RE = re.compile(
    r"^\$\$\s*\n?(.*?)\n?\s*\$\$$", re.MULTILINE | re.DOTALL
)
# Fenced code block with optional language hint: ```lang ... ```
_FENCED_CODE_RE = re.compile(r"^```(\w*)\n(.*?)^```", re.MULTILINE | re.DOTALL)
# Markdown table separator row, e.g. |---|:--:|--:|
_TABLE_SEP_RE = re.compile(r"^\|[-:| ]+\|$")
# Markdown image pattern: ![alt](path)
_IMAGE_RE = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$")
# Bullet / numbered list patterns
_BULLET_RE = re.compile(r"^(\s*)-\s+(.+)")
_NUMBERED_RE = re.compile(r"^(\s*)\d+\.\s+(.+)")
def _convert_block(text: str) -> str:
    """Convert a block of Markdown body text to LaTeX.

    Display math, fenced code blocks, and pre-built LaTeX environments are
    stashed behind placeholder tokens first so the line-by-line conversion
    cannot mangle them, then restored verbatim when their placeholder line
    is reached.
    """
    # Protect display math from further processing
    math_blocks: list[str] = []

    def _stash_math(m: re.Match[str]) -> str:
        idx = len(math_blocks)
        math_blocks.append(m.group(0))  # Keep \\[...\\] as-is
        return f"%%MATH_BLOCK_{idx}%%"

    def _stash_dollar_math(m: re.Match[str]) -> str:
        """Convert $$...$$ to \\begin{equation}...\\end{equation}."""
        idx = len(math_blocks)
        inner = m.group(1).strip()
        math_blocks.append(
            f"\\begin{{equation}}\n{inner}\n\\end{{equation}}"
        )
        return f"%%MATH_BLOCK_{idx}%%"

    text = _DISPLAY_MATH_RE.sub(_stash_math, text)
    # Also handle $$...$$ display math
    text = _DISPLAY_MATH_DOLLAR_RE.sub(_stash_dollar_math, text)
    # Protect fenced code blocks
    code_blocks: list[str] = []

    def _stash_code(m: re.Match[str]) -> str:
        idx = len(code_blocks)
        lang = m.group(1) or ""
        code = m.group(2)
        # Rendered immediately; only the placeholder travels through the
        # rest of the conversion.
        code_blocks.append(_render_code_block(lang, code))
        return f"%%CODE_BLOCK_{idx}%%"

    text = _FENCED_CODE_RE.sub(_stash_code, text)
    # Protect raw LaTeX environments (table, figure, algorithm, etc.)
    # These appear when pre-built LaTeX (e.g. anti-fabrication result tables)
    # is embedded directly in the markdown. Without protection, their
    # contents go through _convert_inline which double-escapes {, }, _, &.
    latex_env_blocks: list[str] = []

    def _stash_latex_env(m: re.Match[str]) -> str:
        idx = len(latex_env_blocks)
        latex_env_blocks.append(m.group(0))
        return f"%%LATEX_ENV_{idx}%%"

    # Match \begin{env}...\end{env} for environments that should pass through.
    text = re.sub(
        r"\\begin\{(table|figure|tabular|algorithm|algorithmic|equation|align"
        r"|gather|multline|minipage|tikzpicture)\*?\}.*?"
        r"\\end\{\1\*?\}",
        _stash_latex_env,
        text,
        flags=re.DOTALL,
    )
    # Process line by line for lists, tables, and paragraphs
    lines = text.split("\n")
    output: list[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # Check for stashed blocks: restore the protected content verbatim.
        if line.strip().startswith("%%MATH_BLOCK_"):
            idx = int(re.search(r"\d+", line.strip()).group())  # type: ignore[union-attr]
            output.append(math_blocks[idx])
            i += 1
            continue
        if line.strip().startswith("%%CODE_BLOCK_"):
            idx = int(re.search(r"\d+", line.strip()).group())  # type: ignore[union-attr]
            output.append(code_blocks[idx])
            i += 1
            continue
        # Stashed LaTeX environments — pass through unchanged
        if line.strip().startswith("%%LATEX_ENV_"):
            idx = int(re.search(r"\d+", line.strip()).group())  # type: ignore[union-attr]
            output.append(latex_env_blocks[idx])
            i += 1
            continue
        # Bullet list
        if _BULLET_RE.match(line):
            items, i = _collect_list(lines, i, _BULLET_RE)
            output.append(_render_itemize(items))
            continue
        # Numbered list
        if _NUMBERED_RE.match(line):
            items, i = _collect_list(lines, i, _NUMBERED_RE)
            output.append(_render_enumerate(items))
            continue
        # Table detection (line starts with |)
        if (
            line.strip().startswith("|")
            and i + 1 < len(lines)
            and _TABLE_SEP_RE.match(lines[i + 1].strip())
        ):
            # Check if previous line is a table caption (e.g. **Table 1: ...**)
            table_caption = ""
            if output:
                prev = output[-1].strip()
                # Match bold caption: \textbf{Table N...} (already converted)
                # or raw markdown: **Table N: ...**
                cap_m = re.match(
                    r"(?:\\textbf\{|[*]{2})\s*Table\s+\d+[.:]?\s*(.*?)(?:\}|[*]{2})$",
                    prev,
                )
                if cap_m:
                    table_caption = f"Table {cap_m.group(1)}" if cap_m.group(1) else ""
                    if not table_caption:
                        table_caption = prev
                    output.pop()  # Remove caption line from output (now inside table)
            table_lines, i = _collect_table(lines, i)
            output.append(_render_table(table_lines, caption=table_caption))
            continue
        # Markdown image: ![alt](path)
        img_match = _IMAGE_RE.match(line.strip())
        if img_match:
            output.append(_render_figure(img_match.group(1), img_match.group(2)))
            i += 1
            continue
        # Regular paragraph line
        output.append(_convert_inline(line))
        i += 1
    return "\n".join(output)
# ---------------------------------------------------------------------------
# List handling
# ---------------------------------------------------------------------------
def _collect_list(
lines: list[str], start: int, pattern: re.Pattern[str]
) -> tuple[list[str], int]:
"""Collect consecutive list items matching *pattern*."""
items: list[str] = []
i = start
while i < len(lines):
m = pattern.match(lines[i])
if m:
items.append(m.group(2))
i += 1
elif lines[i].strip() == "":
# Blank line — might continue list or end it
if i + 1 < len(lines) and pattern.match(lines[i + 1]):
i += 1 # skip blank, continue
else:
break
elif lines[i].startswith(" ") or lines[i].startswith("\t"):
# Continuation of previous item
if items:
items[-1] += " " + lines[i].strip()
i += 1
else:
break
return items, i
def _render_itemize(items: list[str]) -> str:
    """Render list items as a LaTeX itemize environment."""
    rows = "\n".join(f" \\item {_convert_inline(entry)}" for entry in items)
    return "\\begin{itemize}\n" + rows + "\n\\end{itemize}"
def _render_enumerate(items: list[str]) -> str:
    """Render list items as a LaTeX enumerate environment."""
    rows = "\n".join(f" \\item {_convert_inline(entry)}" for entry in items)
    return "\\begin{enumerate}\n" + rows + "\n\\end{enumerate}"
# ---------------------------------------------------------------------------
# Table handling
# ---------------------------------------------------------------------------
def _collect_table(lines: list[str], start: int) -> tuple[list[str], int]:
"""Collect table lines (header + separator + body rows)."""
table: list[str] = []
i = start
while i < len(lines) and lines[i].strip().startswith("|"):
table.append(lines[i])
i += 1
return table, i
def _render_table(table_lines: list[str], caption: str = "") -> str:
    """Render a Markdown table as a LaTeX tabular inside a table environment.

    IMP-23: Auto-wraps in ``\\resizebox`` when columns > 5 or any cell
    text exceeds 25 characters, preventing overflow in conference formats.
    IMP-32: Generates descriptive captions from header columns when the
    caption is empty or just 'Table N'.
    """
    # A valid table needs at least a header row and a separator row.
    if len(table_lines) < 2:
        return ""
    header = _parse_table_row(table_lines[0])
    # Skip separator (line 1)
    body_rows = [_parse_table_row(line) for line in table_lines[2:] if line.strip()]
    ncols = len(header)
    # Determine alignment from separator
    alignments = _parse_alignments(table_lines[1], ncols)
    col_spec = "".join(alignments)
    table_num = _next_table_num()
    # IMP-23: Detect wide tables that need resizebox
    max_cell_len = max(
        (len(c) for row in [header] + body_rows for c in row),
        default=0,
    )
    needs_resize = ncols > 5 or max_cell_len > 25
    lines_out: list[str] = []
    lines_out.append("\\begin{table}[ht]")
    lines_out.append("\\centering")
    # Caption ABOVE table (standard academic convention)
    if caption:
        # Drop a leading "Table N:" prefix — LaTeX adds its own numbering.
        cap_text = re.sub(r"^Table\s+\d+[.:]\s*", "", caption).strip()
        if cap_text:
            lines_out.append(f"\\caption{{{_convert_inline(cap_text)}}}")
        else:
            # Caption was only "Table N" — synthesise a descriptive one.
            auto_cap = _auto_table_caption(header, table_num)
            lines_out.append(f"\\caption{{{auto_cap}}}")
    else:
        auto_cap = _auto_table_caption(header, table_num)
        lines_out.append(f"\\caption{{{auto_cap}}}")
    lines_out.append(f"\\label{{tab:{table_num}}}")
    if needs_resize:
        # BUG-109b fix: Use \columnwidth (works in both 1-col and 2-col layouts)
        # \textwidth in 2-column formats (ICML) is full page width, causing
        # floats wider than a column to be "lost" by LaTeX.
        lines_out.append("\\resizebox{\\columnwidth}{!}{%")
    lines_out.append(f"\\begin{{tabular}}{{{col_spec}}}")
    lines_out.append("\\toprule")
    lines_out.append(
        " & ".join(f"\\textbf{{{_convert_inline(c)}}}" for c in header) + " \\\\"
    )
    lines_out.append("\\midrule")
    for row in body_rows:
        # Pad row to match header length
        padded = row + [""] * (ncols - len(row))
        lines_out.append(
            " & ".join(_convert_inline(c) for c in padded[:ncols]) + " \\\\"
        )
    lines_out.append("\\bottomrule")
    lines_out.append("\\end{tabular}")
    if needs_resize:
        lines_out.append("}")  # close resizebox
    lines_out.append("\\end{table}")
    return "\n".join(lines_out)
def _auto_table_caption(header: list[str], table_num: int) -> str:
    """IMP-32: Generate a descriptive caption from table header columns.

    Falls back to a plain ``Table N`` caption when the header has fewer
    than two non-empty columns; otherwise the first column header is used
    to guess the table type (hyperparameters / ablation / method comparison).
    """
    if len(header) <= 1:
        return f"Table {table_num}"
    cols = [c.strip() for c in header if c.strip()]
    if len(cols) < 2:
        return f"Table {table_num}"
    col0 = cols[0].lower()
    # Mention at most four data columns in the generated caption.
    # (Slicing clamps automatically — no need for min(5, len(cols)).)
    rest = [_convert_inline(c) for c in cols[1:5]]
    # Detect common table types by first-column header keywords.
    _HP_HINTS = {"hyperparameter", "parameter", "param", "hp", "setting", "config"}
    _ABL_HINTS = {"component", "variant", "ablation", "configuration", "module"}
    _MODEL_HINTS = {"model", "method", "approach", "algorithm", "baseline"}
    if any(h in col0 for h in _HP_HINTS):
        # Fixed string — removed spurious f-prefix (no placeholders).
        return "Hyperparameter settings"
    if any(h in col0 for h in _ABL_HINTS):
        return f"Ablation study results across {', '.join(rest)}"
    if any(h in col0 for h in _MODEL_HINTS):
        return f"Performance comparison of different methods on {', '.join(rest)}"
    return f"Comparison of {_convert_inline(cols[0])} across {', '.join(rest)}"
def _parse_table_row(line: str) -> list[str]:
"""Parse ``| a | b | c |`` into ``['a', 'b', 'c']``."""
line = line.strip()
if line.startswith("|"):
line = line[1:]
if line.endswith("|"):
line = line[:-1]
return [cell.strip() for cell in line.split("|")]
def _parse_alignments(sep_line: str, ncols: int) -> list[str]:
    """Parse alignment indicators from separator line."""
    specs: list[str] = []
    for cell in _parse_table_row(sep_line):
        token = cell.strip()
        starts_colon = token.startswith(":")
        ends_colon = token.endswith(":")
        if starts_colon and ends_colon:
            specs.append("c")  # :---: → centered
        elif ends_colon:
            specs.append("r")  # ---: → right-aligned
        else:
            specs.append("l")  # default / :--- → left-aligned
    # Pad with left-aligned columns, then clamp to the expected width.
    while len(specs) < ncols:
        specs.append("l")
    return specs[:ncols]
# ---------------------------------------------------------------------------
# Code block rendering
# ---------------------------------------------------------------------------
# Plain-ASCII replacements for Unicode symbols inside verbatim/code blocks,
# where LaTeX math commands are unavailable (see _render_code_block).
_UNICODE_TO_ASCII: dict[str, str] = {
    "\u2190": "<-", "\u2192": "->", "\u21d0": "<=", "\u21d2": "=>",
    "\u2264": "<=", "\u2265": ">=", "\u2260": "!=", "\u2248": "~=",
    "\u2208": " in ", "\u2209": " not in ",
    "\u2200": "forall ", "\u2203": "exists ",
    "\u2207": "nabla", "\u221e": "inf", "\u00b1": "+/-",
    "\u00d7": "x", "\u00b7": "*", "\u2026": "...",
    "\u03b1": "alpha", "\u03b2": "beta", "\u03b3": "gamma",
    "\u03b4": "delta", "\u03b5": "epsilon", "\u03b6": "zeta",
    "\u03b7": "eta", "\u03b8": "theta", "\u03b9": "iota",
    "\u03ba": "kappa", "\u03bb": "lambda", "\u03bc": "mu",
    "\u03bd": "nu", "\u03be": "xi", "\u03c0": "pi",
    "\u03c1": "rho", "\u03c3": "sigma", "\u03c4": "tau",
    "\u03c5": "upsilon", "\u03c6": "phi", "\u03c7": "chi",
    "\u03c8": "psi", "\u03c9": "omega",
    "\u0394": "Delta", "\u0398": "Theta", "\u039b": "Lambda",
    "\u03a3": "Sigma", "\u03a6": "Phi", "\u03a8": "Psi",
    "\u03a9": "Omega",
    "\u2113": "ell", "\u2202": "d", "\u222b": "int",
}
# BUG-110: Unicode Greek → LaTeX math replacements for inline text.
# Used in _convert_inline() and _sanitize_latex_output().
_UNICODE_GREEK_TO_LATEX: dict[str, str] = {
    # Lowercase
    "\u03b1": "$\\alpha$", "\u03b2": "$\\beta$", "\u03b3": "$\\gamma$",
    "\u03b4": "$\\delta$", "\u03b5": "$\\epsilon$", "\u03b6": "$\\zeta$",
    "\u03b7": "$\\eta$", "\u03b8": "$\\theta$", "\u03b9": "$\\iota$",
    "\u03ba": "$\\kappa$", "\u03bb": "$\\lambda$", "\u03bc": "$\\mu$",
    "\u03bd": "$\\nu$", "\u03be": "$\\xi$", "\u03c0": "$\\pi$",
    "\u03c1": "$\\rho$", "\u03c3": "$\\sigma$", "\u03c4": "$\\tau$",
    "\u03c5": "$\\upsilon$", "\u03c6": "$\\phi$", "\u03c7": "$\\chi$",
    "\u03c8": "$\\psi$", "\u03c9": "$\\omega$",
    # Uppercase
    "\u0393": "$\\Gamma$", "\u0394": "$\\Delta$", "\u0398": "$\\Theta$",
    "\u039b": "$\\Lambda$", "\u039e": "$\\Xi$", "\u03a0": "$\\Pi$",
    "\u03a3": "$\\Sigma$", "\u03a6": "$\\Phi$", "\u03a8": "$\\Psi$",
    "\u03a9": "$\\Omega$",
    # Common math symbols not already handled
    "\u2200": "$\\forall$", "\u2203": "$\\exists$",
    "\u2207": "$\\nabla$", "\u2202": "$\\partial$",
    "\u2026": "\\ldots{}", "\u22c5": "$\\cdot$",
    "\u2113": "$\\ell$", "\u222b": "$\\int$",
    "\u2209": "$\\notin$",
    # Common symbols that cause null-byte corruption if not converted
    "\u00b1": "$\\pm$",  # ±
    "\u00d7": "$\\times$",  # ×
    "\u2248": "$\\approx$",  # ≈
    "\u2264": "$\\leq$",  # ≤
    "\u2265": "$\\geq$",  # ≥
    "\u2260": "$\\neq$",  # ≠
    "\u221e": "$\\infty$",  # ∞
    # Additional symbols found in Runs 49-52
    "\u2212": "$-$",  # − (minus sign, distinct from hyphen)
    "\u2282": "$\\subset$",  # ⊂
    "\u222a": "$\\cup$",  # ∪
    "\u211d": "$\\mathbb{R}$",  # ℝ
    "\u0302": "\\^{}",  # ̂ (combining circumflex)
    "\u0303": "\\~{}",  # ̃ (combining tilde — Run 61 pseudocode)
    "\u221d": "$\\propto$",  # ∝ (proportional to)
    "\u2208": "$\\in$",  # ∈
}
# Keywords whose presence (3+ hits) marks a fenced block as pseudocode
# rather than real source code (see _render_code_block / IMP-28).
_ALGO_KEYWORDS = re.compile(
    r"\b(Input|Output|Return|While|For|If|Else|Repeat|Until|Function|Procedure|Algorithm)\b",
    re.IGNORECASE,
)
def _escape_algo_line(line: str) -> str:
"""Escape LaTeX special characters in an algorithmic pseudocode line.
BUG-177: Raw pseudocode lines contain Python/math syntax that breaks
pdflatex: ``#`` (comment char), ``_`` (subscript), ``%`` (comment),
``&`` (alignment), ``{}``, ``~``, ``^``.
Strategy:
1. Convert ``# comment`` at end of line → ``\\COMMENT{comment}``
2. Protect existing LaTeX commands and math delimiters
3. Escape remaining special characters
"""
# Step 1: Convert Python-style end-of-line comments → \COMMENT{...}
# Match `# comment` that isn't at the start of the line (those are full-line comments)
_comment_match = re.search(r"(?<=\s)#\s*(.+)$", line)
comment_suffix = ""
if _comment_match:
comment_text = _comment_match.group(1).strip()
line = line[: _comment_match.start()].rstrip()
comment_suffix = f" \\COMMENT{{{comment_text}}}"
elif line.strip().startswith("#"):
# Full-line comment
comment_text = line.strip().lstrip("#").strip()
return f"\\COMMENT{{{comment_text}}}"
# Step 2: Protect existing LaTeX commands and math mode from escaping
protected: list[str] = []
def _protect(m: re.Match[str]) -> str:
idx = len(protected)
protected.append(m.group(0))
return f"\x00ALG{idx}\x00"
# Protect: \command{...}, $...$, \(...\)
line = re.sub(r"\\[a-zA-Z]+\{[^}]*\}", _protect, line)
line = re.sub(r"\$[^$]+\$", _protect, line)
line = re.sub(r"\\\(.+?\\\)", _protect, line)
# Step 3: Escape special characters
line = line.replace("&", "\\&")
line = line.replace("%", "\\%")
line = line.replace("#", "\\#")
line = line.replace("_", "\\_")
line = line.replace("{", "\\{")
line = line.replace("}", "\\}")
line = line.replace("~", "\\textasciitilde{}")
line = line.replace("^", "\\textasciicircum{}")
# Step 4: Restore protected regions
for idx, val in enumerate(protected):
line = line.replace(f"\x00ALG{idx}\x00", val)
return line + comment_suffix
def _render_code_block(lang: str, code: str) -> str:
    """Render a fenced code block as a LaTeX environment.

    IMP-28: Detects pseudocode blocks (language hint 'algorithm' /
    'pseudocode', or 3+ algorithm keywords) and renders them inside an
    ``algorithm`` + ``algorithmic`` environment instead of verbatim.
    Replaces Unicode characters (Greek letters, arrows, math symbols)
    with ASCII equivalents so pdflatex can compile the block.
    """
    import unicodedata

    text = code.rstrip("\n")
    for needle, latex_eq in _UNICODE_TO_ASCII.items():
        text = text.replace(needle, latex_eq)
    # Drop any remaining combining marks (tildes, hats, ...) — pdflatex
    # cannot handle them in verbatim/algorithmic material.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    # IMP-28: decide whether this block should be typeset as pseudocode.
    hint = lang.lower().strip()
    treat_as_algo = hint in ("algorithm", "pseudocode", "algo")
    if not treat_as_algo:
        # Heuristic: ≥3 algorithm keywords → treat as pseudocode
        treat_as_algo = len(_ALGO_KEYWORDS.findall(text)) >= 3
    if not treat_as_algo:
        return f"\\begin{{verbatim}}\n{text}\n\\end{{verbatim}}"

    # A leading `// ...` line supplies the caption.
    algo_lines = text.split("\n")
    caption = "Algorithm"
    if algo_lines and algo_lines[0].strip().startswith("//"):
        caption = algo_lines[0].strip().lstrip("/ ").strip()
        algo_lines = algo_lines[1:]

    # Lines already written with algorithmic commands pass through;
    # everything else is escaped and wrapped in \STATE (BUG-177).
    algo_commands = ("\\STATE", "\\IF", "\\ELSE", "\\ELSIF", "\\ENDIF",
                     "\\FOR", "\\ENDFOR", "\\WHILE", "\\ENDWHILE",
                     "\\REPEAT", "\\UNTIL", "\\RETURN", "\\REQUIRE", "\\ENSURE")
    rendered: list[str] = []
    for raw in algo_lines:
        content = raw.strip()
        if not content:
            continue
        if content.startswith(algo_commands):
            rendered.append(content)
        else:
            rendered.append(f"\\STATE {_escape_algo_line(content)}")
    return (
        "\\begin{algorithm}[ht]\n"
        f"\\caption{{{_convert_inline(caption)}}}\n"
        "\\begin{algorithmic}[1]\n"
        + "\n".join(rendered) + "\n"
        "\\end{algorithmic}\n"
        "\\end{algorithm}"
    )
# ---------------------------------------------------------------------------
# Figure rendering
# ---------------------------------------------------------------------------
def _render_figure(caption: str, path: str) -> str:
    """Render a markdown image as a LaTeX figure environment."""
    number = _next_figure_num()
    # Spaces in paths break \includegraphics; underscores are fine.
    safe_path = path.replace(" ", "_")
    caption_tex = _convert_inline(caption) if caption else f"Figure {number}"
    # Derive a stable label from the caption, falling back to the number.
    label = re.sub(r"[^a-z0-9]+", "_", caption.lower()).strip("_")[:30]
    if not label:
        label = str(number)
    return (
        "\\begin{figure}[t]\n"
        "\\centering\n"
        f"\\includegraphics[width=0.95\\columnwidth]{{{safe_path}}}\n"
        f"\\caption{{{caption_tex}}}\n"
        f"\\label{{fig:{label}}}\n"
        "\\end{figure}"
    )
# ---------------------------------------------------------------------------
# Inline conversion
# ---------------------------------------------------------------------------
# Order matters: process bold before italic to avoid conflicts.
_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")
# NOTE(review): the next line is garbled — a text-extraction artifact
# appears to have eaten everything between a `<` (lookbehind `(?<!...)`)
# and the next `>` (`-> str:`), fusing the italic regex, possibly other
# module-level regex definitions, and the `def _convert_inline(...)`
# signature into one line.  Preserved verbatim; restore from the original
# source before editing behavior.
_ITALIC_RE = re.compile(r"(? str:
    """Convert inline Markdown formatting to LaTeX.

    Preserves:
    - Inline math ``\\(...\\)`` and ``$...$``
    - ``\\cite{...}`` references
    - Display math markers (already handled at block level)
    """
    # Normalize Unicode punctuation to LaTeX equivalents
    text = text.replace("\u2014", "---")  # em-dash —
    text = text.replace("\u2013", "--")  # en-dash –
    text = text.replace("\u201c", "``")  # left double quote "
    text = text.replace("\u201d", "''")  # right double quote "
    text = text.replace("\u2018", "`")  # left single quote '
    text = text.replace("\u2019", "'")  # right single quote '
    text = text.replace("\u00b1", "$\\pm$")  # ±
    text = text.replace("\u2248", "$\\approx$")  # ≈
    text = text.replace("\u2264", "$\\leq$")  # ≤
    text = text.replace("\u2265", "$\\geq$")  # ≥
    text = text.replace("\u2192", "$\\rightarrow$")  # →
    text = text.replace("\u2190", "$\\leftarrow$")  # ←
    text = text.replace("\u00d7", "$\\times$")  # ×
    text = text.replace("\u2260", "$\\neq$")  # ≠
    text = text.replace("\u2208", "$\\in$")  # ∈
    text = text.replace("\u221e", "$\\infty$")  # ∞
    # BUG-110: Replace Unicode Greek letters with LaTeX math equivalents.
    # These appear when LLMs emit raw Unicode (e.g. "ε-greedy" instead of
    # "$\epsilon$-greedy") and cause fatal pdflatex errors.
    # NOTE(review): _UNICODE_GREEK_TO_LATEX is defined elsewhere in this
    # module (not visible in this chunk).
    for _uchar, _lcmd in _UNICODE_GREEK_TO_LATEX.items():
        if _uchar in text:
            text = text.replace(_uchar, _lcmd)
    # Protect math and cite from escaping
    protected: list[str] = []

    def _protect(m: re.Match[str]) -> str:
        idx = len(protected)
        protected.append(m.group(0))
        return f"\x00PROT{idx}\x00"

    # Protect inline math: \(...\) and $...$
    text = re.sub(r"\\\(.+?\\\)", _protect, text)
    # NOTE(review): the next line is garbled by the same extraction
    # artifact — it appears to fuse the `$...$`/`\cite` protection
    # patterns with the definition of `_convert_and_protect_link`; the
    # indented lines that follow belong to that lost nested function.
    text = re.sub(r"(? str:
        href = f"\\href{{{m.group(2)}}}{{{m.group(1)}}}"
        idx = len(protected)
        protected.append(href)
        return f"\x00PROT{idx}\x00"
    text = _LINK_RE.sub(_convert_and_protect_link, text)
    # Escape special LaTeX characters
    # NOTE(review): _LATEX_SPECIAL / _LATEX_TILDE / _LATEX_CARET /
    # _LATEX_DOLLAR / _INLINE_CODE_RE / _LINK_RE are module-level regexes
    # defined outside this chunk.
    text = _LATEX_SPECIAL.sub(r"\\\1", text)
    text = _LATEX_TILDE.sub(r"\\textasciitilde{}", text)
    text = _LATEX_CARET.sub(r"\\textasciicircum{}", text)
    text = _LATEX_DOLLAR.sub(r"\\$", text)
    # Convert bold **text** → \textbf{text}
    text = _BOLD_RE.sub(r"\\textbf{\1}", text)
    # Convert italic *text* → \textit{text}
    text = _ITALIC_RE.sub(r"\\textit{\1}", text)
    # Convert inline code `text` → \texttt{text}
    text = _INLINE_CODE_RE.sub(r"\\texttt{\1}", text)
    # Links and images were already converted+protected before escaping.
    # Fallback: convert any remaining [cite_key] patterns to \cite{key}
    # This catches citations that were not converted upstream.
    # BUG-32 fix: key pattern must also match author2017keyword style keys
    # (e.g., roijers2017multiobjective, abels2019dynamic)
    _CITE_KEY_PAT = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9_]*"
    text = re.sub(
        rf"\[({_CITE_KEY_PAT}(?:\s*,\s*{_CITE_KEY_PAT})*)\]",
        r"\\cite{\1}",
        text,
    )
    # Restore protected segments in reverse order so that nested
    # markers (e.g. PROT0 inside PROT1's value) are resolved correctly.
    for idx in range(len(protected) - 1, -1, -1):
        text = text.replace(f"\x00PROT{idx}\x00", protected[idx])
    return text
# ---------------------------------------------------------------------------
# Completeness checking (R10-Fix5)
# ---------------------------------------------------------------------------
# Canonical section names a complete conference paper must contain.
# check_paper_completeness() warns about any that are missing.
_EXPECTED_SECTIONS = {
    "introduction",
    "related work",
    "method",
    "experiment",
    "result",
    "discussion",
    "conclusion",
}
# Maps common heading variants (lower-cased) onto their canonical
# _EXPECTED_SECTIONS entry, so e.g. "Methodology" satisfies "method".
_SECTION_ALIASES: dict[str, str] = {
    "methodology": "method",
    "methods": "method",
    "proposed method": "method",
    "approach": "method",
    "experiments": "experiment",
    "experimental setup": "experiment",
    "experimental results": "result",
    "results": "result",
    "results and discussion": "result",
    "results and analysis": "result",
    "discussion and results": "result",
    "conclusions": "conclusion",
    "conclusion and future work": "conclusion",
    "summary": "conclusion",
    "background": "related work",
    "literature review": "related work",
    "prior work": "related work",
}
def check_paper_completeness(sections: list[_Section]) -> list[str]:
    """Check whether a paper contains all expected sections.

    Runs a series of structural lint checks over the parsed sections and
    returns a list of human-readable warning strings; an empty list means
    the paper structure looks complete.  Checks, in order:

    1. presence of a title (an H1/H2 heading that is not a known section name)
    2. presence of all _EXPECTED_SECTIONS (via _SECTION_ALIASES, then a
       substring fallback)
    3. required extra sections — Limitations (NeurIPS/ICLR mandate, T2.5)
    4. abstract length and raw metric-key dumps (T1.5)
    5. truncation markers anywhere in the body
    6. total and per-section word counts (R10-Fix5)
    7. bullet-point density in prose sections

    NOTE(review): ``_Section`` is a project type; this function reads
    ``sec.level``, ``sec.heading``, ``sec.heading_lower`` and ``sec.body``
    — confirm its definition before relying on other attributes.
    """
    warnings: list[str] = []
    # Check for valid title — look for any H1/H2 heading that could be a title
    _has_title = any(
        sec.level in (1, 2) and sec.heading_lower not in ("abstract", "introduction",
            "related work", "method", "methods", "methodology", "experiments",
            "results", "discussion", "conclusion", "limitations", "references")
        for sec in sections
    )
    if not _has_title:
        warnings.append(
            "No valid title found in paper. The output may lack proper heading structure."
        )
    found_sections: set[str] = set()
    section_headings: list[str] = []
    for sec in sections:
        if sec.level in (1, 2) and sec.heading:
            heading_lower = sec.heading.strip().lower()
            section_headings.append(heading_lower)
            if heading_lower in _EXPECTED_SECTIONS:
                found_sections.add(heading_lower)
            elif heading_lower in _SECTION_ALIASES:
                found_sections.add(_SECTION_ALIASES[heading_lower])
            else:
                # Substring fallback: "Experiments on Atari" → "experiment".
                for expected in _EXPECTED_SECTIONS:
                    if expected in heading_lower:
                        found_sections.add(expected)
                        break
    missing = _EXPECTED_SECTIONS - found_sections
    if missing:
        warnings.append(
            f"Missing sections: {', '.join(sorted(missing))}. "
            f"Found: {', '.join(section_headings)}"
        )
    # T2.5: Check for required conference sections (NeurIPS/ICLR mandate Limitations)
    _required_extras = {"limitations"}
    _extra_aliases = {
        "limitation": "limitations",
        "limitations and future work": "limitations",
        "limitations and broader impact": "limitations",
    }
    found_extras: set[str] = set()
    for sec in sections:
        if sec.level in (1, 2) and sec.heading:
            hl = sec.heading.strip().lower()
            if hl in _required_extras:
                found_extras.add(hl)
            elif hl in _extra_aliases:
                found_extras.add(_extra_aliases[hl])
            elif "limitation" in hl:
                # Catch-all: any heading mentioning "limitation" counts.
                found_extras.add("limitations")
    missing_extras = _required_extras - found_extras
    if missing_extras:
        warnings.append(
            f"Missing required sections for NeurIPS/ICLR: "
            f"{', '.join(sorted(missing_extras))}."
        )
    # T1.5: Abstract length and quality checks
    abstract_text = ""
    for sec in sections:
        if sec.heading_lower == "abstract":
            abstract_text = sec.body
            break
    if abstract_text:
        word_count = len(abstract_text.split())
        if word_count > 300:
            warnings.append(
                f"Abstract is {word_count} words (conference limit: 150-250). "
                f"Must be shortened."
            )
        elif word_count < 150:
            warnings.append(
                f"Abstract is only {word_count} words (expected 150-250 for conferences)."
            )
        # Detect raw variable names / metric key dumps
        # (patterns like "train_loss/epoch_3 =" leaking into prose)
        raw_vars = re.findall(r"\b\w+_\w+/\w+(?:_\w+)*\s*=", abstract_text)
        if raw_vars:
            warnings.append(
                f"Abstract contains raw variable names: {raw_vars[:3]}. "
                f"Replace with human-readable descriptions."
            )
    # Detect truncation markers
    all_body = " ".join(sec.body for sec in sections)
    truncation_markers = [
        "further sections continue",
        "remaining sections unchanged",
        "sections continue unchanged",
        "content continues",
        "[to be continued]",
        "[remaining content]",
    ]
    for marker in truncation_markers:
        if marker in all_body.lower():
            warnings.append(
                f"Truncation marker detected: '{marker}'. "
                f"Paper content may be incomplete."
            )
    # Word count check
    total_words = sum(len(sec.body.split()) for sec in sections)
    if total_words < 2000:
        warnings.append(
            f"Paper body is only {total_words} words "
            f"(expected 5,000-6,500 for conference paper). "
            f"Content may be severely truncated."
        )
    # Per-section word count check (safety net during LaTeX conversion)
    # Local import to avoid a module-level import cycle — presumably;
    # TODO confirm against researchclaw.prompts.
    from researchclaw.prompts import SECTION_WORD_TARGETS, _SECTION_TARGET_ALIASES
    for sec in sections:
        if sec.level not in (1, 2) or not sec.heading:
            continue
        canon = sec.heading_lower
        if canon not in SECTION_WORD_TARGETS:
            canon = _SECTION_TARGET_ALIASES.get(sec.heading_lower, "")
        if not canon or canon not in SECTION_WORD_TARGETS:
            # Heading has no word-count target — skip silently.
            continue
        lo, hi = SECTION_WORD_TARGETS[canon]
        wc = len(sec.body.split())
        if wc < int(lo * 0.6):
            warnings.append(
                f"Section '{sec.heading}' is only {wc} words "
                f"(expected {lo}-{hi}). Content may be severely truncated."
            )
        elif wc > int(hi * 1.5):
            warnings.append(
                f"Section '{sec.heading}' is {wc} words "
                f"(expected {lo}-{hi}). Consider trimming."
            )
    # Bullet density check for body sections
    _bullet_re_cc = re.compile(r"^\s*[-*]\s+", re.MULTILINE)
    _numbered_re_cc = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)
    # Sections where bullet lists are conventionally acceptable.
    _bullet_ok_sections = {"introduction", "limitations", "limitation", "abstract"}
    for sec in sections:
        if sec.level not in (1, 2) or not sec.heading:
            continue
        hl = sec.heading_lower
        if hl in _bullet_ok_sections:
            continue
        if not sec.body:
            continue
        total_lines = len([ln for ln in sec.body.splitlines() if ln.strip()])
        if total_lines < 4:
            # Too short for a meaningful density ratio.
            continue
        bullet_count = (
            len(_bullet_re_cc.findall(sec.body))
            + len(_numbered_re_cc.findall(sec.body))
        )
        density = bullet_count / total_lines
        if density > 0.30:
            warnings.append(
                f"Section '{sec.heading}' has high bullet-point density "
                f"({bullet_count}/{total_lines} lines = {density:.0%}). "
                f"Conference papers should use flowing prose."
            )
    return warnings
def _escape_latex(text: str) -> str:
    """Escape LaTeX special characters in plain text (titles, headings).
    Does NOT escape inside math delimiters or \\commands.
    """
    # Protect math first
    protected: list[str] = []

    def _protect(m: re.Match[str]) -> str:
        idx = len(protected)
        protected.append(m.group(0))
        return f"\x00PROT{idx}\x00"

    text = re.sub(r"\\\(.+?\\\)", _protect, text)
    # NOTE(review): the next line is garbled by extraction — it appears to
    # fuse the remainder of this _escape_latex, a FILE boundary of the
    # dump, the table module's imports/dataclasses (LatexTable,
    # VerifiedRegistry, ConditionResult), and the signature of
    # generate_latex_tables(registry, metric_name, metric_direction,
    # two_column).  Preserved verbatim; restore from the original sources
    # before editing.  The docstring and body below belong to
    # generate_latex_tables.
    text = re.sub(r"(? list[LatexTable]:
    """Generate LaTeX tables from a VerifiedRegistry.

    Parameters
    ----------
    registry:
        The verified registry built from experiment data.
    metric_name:
        Human-readable name for the primary metric column.
    metric_direction:
        ``"maximize"`` or ``"minimize"`` — determines which result is bolded.
    two_column:
        If True, use ``table*`` environment (for 2-column formats like ICML).

    Returns
    -------
    list[LatexTable]
        One or more tables. Usually just one main results table.
    """
    tables: list[LatexTable] = []
    # --- Main results table ---
    conditions = _get_reportable_conditions(registry)
    if not conditions:
        # Nothing finished cleanly — emit no tables rather than empty ones.
        logger.warning("No reportable conditions — skipping table generation")
        return tables
    main_table = _build_main_table(
        conditions,
        metric_name=metric_name,
        metric_direction=metric_direction,
        two_column=two_column,
    )
    tables.append(main_table)
    # --- Per-seed breakdown table (if seeds > 1 for any condition) ---
    has_multi_seed = any(c.n_seeds >= 2 for c in conditions)
    if has_multi_seed:
        seed_table = _build_per_seed_table(
            conditions,
            metric_name=metric_name,
            two_column=two_column,
        )
        tables.append(seed_table)
    return tables
def _get_reportable_conditions(registry: VerifiedRegistry) -> list[ConditionResult]:
    """Return conditions with at least 1 seed and a finite mean,
    sorted alphabetically by name for deterministic table order."""
    reportable = [
        cond
        for cond in registry.conditions.values()
        if cond.n_seeds >= 1 and cond.mean is not None and math.isfinite(cond.mean)
    ]
    return sorted(reportable, key=lambda c: c.name)
def _build_main_table(
    conditions: list[ConditionResult],
    *,
    metric_name: str,
    metric_direction: str,
    two_column: bool,
) -> LatexTable:
    """Build the main results table with mean ± std per condition.

    The best condition (per ``metric_direction``) is bolded; single-seed
    conditions are marked with a dagger explained in a table footnote.
    """
    checked: set[float] = set()
    # Row index to bold, chosen by metric direction (may be None).
    bold_row = _find_best(conditions, metric_direction)
    body_rows: list[str] = []
    for row_no, cond in enumerate(conditions):
        mean_text = _fmt(cond.mean)
        if cond.mean is not None:
            checked.add(round(cond.mean, 4))
        if cond.std is not None and cond.std > 0 and cond.n_seeds >= 2:
            cell = f"{mean_text} $\\pm$ {_fmt(cond.std)}"
            checked.add(round(cond.std, 4))
        elif cond.n_seeds == 1:
            # Single seed: dagger instead of a std column.
            cell = f"{mean_text}$^{{\\ddagger}}$"
        else:
            cell = mean_text
        if row_no == bold_row:
            cell = f"\\textbf{{{cell}}}"
        body_rows.append(
            f"{_escape_latex(cond.name)} & {cell} & {cond.n_seeds} \\\\"
        )
    env = "table*" if two_column else "table"
    footnotes: list[str] = []
    if any(c.n_seeds == 1 for c in conditions):
        footnotes.append(
            "$^{\\ddagger}$Single seed; no standard deviation available."
        )
    notes = "\n".join(footnotes)
    if notes:
        notes = f"\n\\vspace{{2pt}}\\par\\footnotesize {notes}\n"
    latex = (
        f"\\begin{{{env}}}[htbp]\n"
        f"\\centering\n"
        f"\\caption{{Experimental results. "
        f"{len(conditions)} conditions evaluated.}}\n"
        f"\\label{{tab:main_results}}\n"
        f"% AUTO-GENERATED FROM EXPERIMENT DATA — DO NOT MODIFY NUMBERS\n"
        f"\\begin{{tabular}}{{l c r}}\n"
        f"\\toprule\n"
        f"Method & {metric_name} & $n$ \\\\\n"
        f"\\midrule\n"
        + "\n".join(body_rows) + "\n"
        f"\\bottomrule\n"
        f"\\end{{tabular}}{notes}\n"
        f"\\end{{{env}}}"
    )
    return LatexTable(
        label="tab:main_results",
        caption=f"Experimental results. {len(conditions)} conditions evaluated.",
        latex_code=latex,
        verified_values=checked,
        n_conditions=len(conditions),
        n_total_seeds=sum(c.n_seeds for c in conditions),
    )
def _build_per_seed_table(
    conditions: list[ConditionResult],
    *,
    metric_name: str,
    two_column: bool,
) -> LatexTable:
    """Build the per-seed breakdown table: one column per seed plus a mean.

    Conditions with fewer seeds than the widest condition get ``---`` in
    the missing cells.
    """
    checked: set[float] = set()
    # Widest condition determines the number of seed columns.
    n_cols = max(c.n_seeds for c in conditions)
    header_cells = " & ".join(f"Seed {i}" for i in range(n_cols))
    spec = "l " + " ".join(["r"] * n_cols) + " r"
    body_rows: list[str] = []
    for cond in conditions:
        cells: list[str] = []
        for seed_no in range(n_cols):
            # NOTE: per_seed_values is presumably keyed by seed index —
            # missing/non-finite entries render as "---".
            value = cond.per_seed_values.get(seed_no)
            if value is not None and math.isfinite(value):
                cells.append(_fmt(value))
                checked.add(round(value, 4))
            else:
                cells.append("---")
        mean_cell = _fmt(cond.mean) if cond.mean is not None else "---"
        joined = " & ".join(cells)
        body_rows.append(f"{_escape_latex(cond.name)} & {joined} & {mean_cell} \\\\")
    env = "table*" if two_column else "table"
    latex = (
        f"\\begin{{{env}}}[htbp]\n"
        f"\\centering\n"
        f"\\caption{{Per-seed results breakdown.}}\n"
        f"\\label{{tab:per_seed}}\n"
        f"% AUTO-GENERATED FROM EXPERIMENT DATA — DO NOT MODIFY NUMBERS\n"
        f"\\begin{{tabular}}{{{spec}}}\n"
        f"\\toprule\n"
        f"Method & {header_cells} & Mean \\\\\n"
        f"\\midrule\n"
        + "\n".join(body_rows) + "\n"
        f"\\bottomrule\n"
        f"\\end{{tabular}}\n"
        f"\\end{{{env}}}"
    )
    return LatexTable(
        label="tab:per_seed",
        caption="Per-seed results breakdown.",
        latex_code=latex,
        verified_values=checked,
        n_conditions=len(conditions),
        n_total_seeds=sum(c.n_seeds for c in conditions),
    )
def build_condition_whitelist(registry: VerifiedRegistry) -> str:
    """Generate a human-readable condition whitelist for the LLM prompt.

    Example output::

        CONDITION WHITELIST (you may ONLY discuss these conditions):
        - DQN (3 seeds, mean=206.10)
        - DQN+Abstraction (3 seeds, mean=278.93)
        - DQN+RawCount (3 seeds, mean=180.80)
    """
    out = ["CONDITION WHITELIST (you may ONLY discuss these conditions):"]
    for cond in sorted(registry.conditions.values(), key=lambda c: c.name):
        # Skip conditions without any usable (finite) result.
        if cond.n_seeds == 0 or cond.mean is None or not math.isfinite(cond.mean):
            continue
        out.append(f"- {cond.name} ({cond.n_seeds} seed(s), mean={cond.mean:.4f})")
    if len(out) == 1:
        # Only the header — make the emptiness explicit for the LLM.
        out.append("- (no conditions completed)")
    return "\n".join(out)
def _find_best(conditions: list[ConditionResult], direction: str) -> int | None:
    """Return the index of the best condition, or None if the list is empty.

    Conditions with ``mean is None`` are skipped; if the current best has a
    None mean it is displaced by the first condition with a real mean.
    """
    if not conditions:
        return None
    best = 0
    for idx, cond in enumerate(conditions):
        if cond.mean is None:
            continue
        incumbent = conditions[best].mean
        if incumbent is None:
            best = idx
        elif direction == "maximize" and cond.mean > incumbent:
            best = idx
        elif direction == "minimize" and cond.mean < incumbent:
            best = idx
    return best
def _fmt(value: float | None) -> str:
"""Format a number for LaTeX tables with sig-fig-aware rounding."""
if value is None or not math.isfinite(value):
return "---"
# Sig-fig-aware formatting (same approach as BUG-83 fix)
av = abs(value)
if av >= 100:
return f"{value:.2f}"
elif av >= 1:
return f"{value:.4f}"
elif av >= 0.001:
return f"{value:.4f}"
elif av > 0:
# Very small values: use 2 significant figures
import decimal
d = decimal.Decimal(str(value)).normalize()
# Count leading zeros after decimal point
exp = d.adjusted()
sig_digits = max(2, -exp + 1)
return f"{value:.{sig_digits}f}"
else:
return "0.0000"
def _escape_latex(text: str) -> str:
"""Escape special LaTeX characters in condition names."""
# Backslash must be first to avoid double-escaping
replacements = [
("\\", "\\textbackslash{}"),
("&", "\\&"),
("%", "\\%"),
("#", "\\#"),
("_", "\\_"),
("$", "\\$"),
("{", "\\{"),
("}", "\\}"),
("~", "\\textasciitilde{}"),
("^", "\\textasciicircum{}"),
]
for old, new in replacements:
text = text.replace(old, new)
return text
================================================
FILE: researchclaw/templates/styles/iclr_2025/iclr2025_conference.bst
================================================
%% iclr2025_conference.bst — ICLR 2025 bibliography style
%% Symlink-equivalent to iclr2026_conference.bst (same format).
%% Bundled by AutoResearchClaw for offline compilation.
% Declared entry fields; `label` is the single per-entry string variable.
ENTRY
{ author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass }
{}
{ label }
% output.state tracks punctuation context between successive output calls.
INTEGERS { output.state before.all mid.sentence after.sentence after.block }
FUNCTION {init.state.consts}
{ #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := }
STRINGS { s t }
% Core writer: emits pending text with ", ", ". \newblock " or ". "
% separators depending on output.state, then resets state to mid.sentence.
FUNCTION {output.nonnull}
{ 's :=
output.state mid.sentence =
{ ", " * write$ }
{ output.state after.block =
{ add.period$ write$ newline$ "\newblock " write$ }
{ output.state before.all = 'write$ { add.period$ " " * write$ } if$ }
if$
mid.sentence 'output.state :=
}
if$
s
}
% output: silently drop empty strings; output.check: same but warn when a
% required field is missing.
FUNCTION {output}
{ duplicate$ empty$ 'pop$ 'output.nonnull if$ }
FUNCTION {output.check}
{ 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ }
FUNCTION {fin.entry} { add.period$ write$ newline$ }
FUNCTION {new.block}
{ output.state before.all = 'skip$ { after.block 'output.state := } if$ }
% Boolean helpers operating on 0/1 integers on the stack.
FUNCTION {not} { { #0 } { #1 } if$ }
FUNCTION {and} { 'skip$ { pop$ #0 } if$ }
FUNCTION {or} { { pop$ #1 } 'skip$ if$ }
FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ }
FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ }
INTEGERS { nameptr namesleft numnames }
% Author list: "First von Last, Jr" names joined with commas / " and ";
% a literal "others" renders as " et~al.".
FUNCTION {format.names}
{ 's := #1 'nameptr := s num.names$ 'numnames := numnames 'namesleft :=
{ namesleft #0 > }
{ s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." * } { " and " * t * } if$ } if$ } 't if$
nameptr #1 + 'nameptr := namesleft #1 - 'namesleft :=
}
while$
}
% Field formatters — empty fields collapse to "".
FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ }
FUNCTION {format.title} { title empty$ { "" } { title } if$ }
FUNCTION {format.btitle} { title emphasize }
FUNCTION {format.date} { year empty$ { "" } { year } if$ }
FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ }
FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ }
FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ }
FUNCTION {output.bibitem}
{ newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := }
% Entry-type drivers.
FUNCTION {article}
{ output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry }
FUNCTION {inproceedings}
{ output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry }
FUNCTION {conference} { inproceedings }
FUNCTION {book}
{ output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry }
FUNCTION {misc}
{ output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry }
% All remaining entry types fall back to the permissive `misc` driver.
FUNCTION {techreport} { misc }
FUNCTION {phdthesis} { misc }
FUNCTION {mastersthesis} { misc }
FUNCTION {unpublished} { misc }
FUNCTION {default.type} { misc }
READ
% Entries are sorted by lower-cased, purified cite key.
FUNCTION {sortify} { purify$ "l" change.case$ }
FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := }
ITERATE {presort}
SORT
% Emit the preamble (if any) and the thebibliography wrapper; "99" is the
% widest-label placeholder for numeric labels.
FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } if$ "\begin{thebibliography}{99}" write$ newline$ }
FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ }
EXECUTE {begin.bib}
EXECUTE {init.state.consts}
ITERATE {call.type$}
EXECUTE {end.bib}
================================================
FILE: researchclaw/templates/styles/iclr_2025/iclr2025_conference.sty
================================================
% iclr2025_conference.sty — ICLR 2025 conference style file
% Based on the official ICLR submission template structure.
% Bundled by AutoResearchClaw for offline compilation.
% Official source: https://github.com/ICLR/Master-Template/raw/master/iclr2025.zip
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{iclr2025_conference}[2025/01/15 ICLR 2025 conference style]
% Option switches: `final` prints the author block; `preprint` is declared
% here but — NOTE(review) — never consulted below; only `final`
% de-anonymizes the title block.
\newif\if@iclr@final \@iclr@finalfalse
\newif\if@iclr@preprint \@iclr@preprintfalse
\DeclareOption{final}{\@iclr@finaltrue}
\DeclareOption{preprint}{\@iclr@preprinttrue}
\ProcessOptions\relax
% Page geometry: 5.5in x 9.0in text block with a 1in top margin.
\RequirePackage{geometry}
\geometry{textwidth=5.5in,textheight=9.0in,top=1.0in,headheight=12pt,headsep=25pt,footskip=30pt}
% Times body font, single spacing, classic indented paragraphs.
\RequirePackage{times}
\renewcommand{\baselinestretch}{1.0}
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}
% Section heading sizes and vertical spacing (large/normalsize bold).
\renewcommand{\section}{\@startsection{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}{1.0ex plus .2ex}{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}}
% Title block: authors only under the `final` option; otherwise the
% double-blind "Anonymous authors" banner is shown.
\def\@maketitle{%
\vbox to 0pt{}\vskip -0.5in
\begin{center}%
{\LARGE\bfseries \@title \par}\vskip 0.3in
\if@iclr@final
{\large\lineskip .5em\begin{tabular}[t]{c}\@author\end{tabular}\par}%
\else
{\large Anonymous authors\par}{\normalsize Paper under double-blind review\par}%
\fi
\vskip 0.3in
\end{center}\par\vskip 0.5em
}
% Abstract: centered bold heading above an indented quote block.
\renewenvironment{abstract}{\centerline{\large\bfseries Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex}
% Page style: centered folio in the footer, no rule under the header.
\RequirePackage{fancyhdr}
\pagestyle{fancy}\fancyhf{}
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0pt}
% Numeric, sorted-and-compressed citations via natbib.
\RequirePackage[numbers,sort&compress]{natbib}
\endinput
================================================
FILE: researchclaw/templates/styles/iclr_2026/iclr2026_conference.bst
================================================
%% iclr2026_conference.bst — ICLR 2026 bibliography style
%% Bundled by AutoResearchClaw for offline compilation.
%% This is a minimal numeric bibliography style compatible with natbib.
%% For full-fidelity formatting, download from https://github.com/ICLR/Master-Template
% Declared entry fields; `label` is the single per-entry string variable.
ENTRY
{ author
title
journal
booktitle
year
volume
number
pages
doi
url
note
publisher
address
edition
eprint
archiveprefix
primaryclass
}
{}
{ label }
% output.state tracks punctuation context between successive output calls.
INTEGERS { output.state before.all mid.sentence after.sentence after.block }
FUNCTION {init.state.consts}
{ #0 'before.all :=
#1 'mid.sentence :=
#2 'after.sentence :=
#3 'after.block :=
}
STRINGS { s t }
% Core writer: emits pending text with ", ", ". \newblock " or ". "
% separators depending on output.state, then resets state to mid.sentence.
FUNCTION {output.nonnull}
{ 's :=
output.state mid.sentence =
{ ", " * write$ }
{ output.state after.block =
{ add.period$ write$
newline$
"\newblock " write$
}
{ output.state before.all =
'write$
{ add.period$ " " * write$ }
if$
}
if$
mid.sentence 'output.state :=
}
if$
s
}
% output: silently drop empty strings.
FUNCTION {output}
{ duplicate$ empty$
'pop$
'output.nonnull
if$
}
% output.check: like output, but warn when a required field is empty.
FUNCTION {output.check}
{ 't :=
duplicate$ empty$
{ pop$ "empty " t * " in " * cite$ * warning$ }
'output.nonnull
if$
}
FUNCTION {fin.entry}
{ add.period$
write$
newline$
}
FUNCTION {new.block}
{ output.state before.all =
'skip$
{ after.block 'output.state := }
if$
}
% Boolean helpers operating on 0/1 integers on the stack.
FUNCTION {not}
{ { #0 }
{ #1 }
if$
}
FUNCTION {and}
{ 'skip$
{ pop$ #0 }
if$
}
FUNCTION {or}
{ { pop$ #1 }
'skip$
if$
}
FUNCTION {field.or.null}
{ duplicate$ empty$
{ pop$ "" }
'skip$
if$
}
FUNCTION {emphasize}
{ duplicate$ empty$
{ pop$ "" }
{ "\emph{" swap$ * "}" * }
if$
}
INTEGERS { nameptr namesleft numnames }
% Author list: "First von Last, Jr" names joined with commas / " and ";
% a literal "others" renders as " et~al.".
FUNCTION {format.names}
{ 's :=
#1 'nameptr :=
s num.names$ 'numnames :=
numnames 'namesleft :=
{ namesleft #0 > }
{ s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
nameptr #1 >
{ namesleft #1 >
{ ", " * t * }
{ numnames #2 >
{ "," * }
'skip$
if$
t "others" =
{ " et~al." * }
{ " and " * t * }
if$
}
if$
}
't
if$
nameptr #1 + 'nameptr :=
namesleft #1 - 'namesleft :=
}
while$
}
% Field formatters — empty fields collapse to "".
FUNCTION {format.authors}
{ author empty$
{ "" }
{ author format.names }
if$
}
FUNCTION {format.title}
{ title empty$
{ "" }
{ title }
if$
}
FUNCTION {format.btitle}
{ title emphasize
}
FUNCTION {format.date}
{ year empty$
{ "" }
{ year }
if$
}
FUNCTION {format.bvolume}
{ volume empty$
{ "" }
{ "volume " volume * }
if$
}
FUNCTION {format.pages}
{ pages empty$
{ "" }
{ "pp. " pages * }
if$
}
FUNCTION {format.url}
{ url empty$
{ "" }
{ "\url{" url * "}" * }
if$
}
% Emit the \bibitem line and reset punctuation state for the new entry.
FUNCTION {output.bibitem}
{ newline$
"\bibitem{" write$
cite$ write$
"}" write$
newline$
""
before.all 'output.state :=
}
% Entry-type drivers.
FUNCTION {article}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
journal emphasize "journal" output.check
format.bvolume output
format.pages output
format.date "year" output.check
format.url output
fin.entry
}
FUNCTION {inproceedings}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
"In " booktitle emphasize * output
format.pages output
format.date "year" output.check
format.url output
fin.entry
}
FUNCTION {conference} { inproceedings }
FUNCTION {book}
{ output.bibitem
format.authors "author" output.check
new.block
format.btitle "title" output.check
publisher output
format.date "year" output.check
fin.entry
}
FUNCTION {misc}
{ output.bibitem
format.authors output
new.block
format.title output
new.block
note output
format.date output
format.url output
fin.entry
}
% All remaining entry types fall back to the permissive `misc` driver.
FUNCTION {techreport} { misc }
FUNCTION {phdthesis} { misc }
FUNCTION {mastersthesis} { misc }
FUNCTION {unpublished} { misc }
FUNCTION {default.type} { misc }
READ
% Entries are sorted by lower-cased, purified cite key.
FUNCTION {sortify}
{ purify$
"l" change.case$
}
FUNCTION {presort}
{ cite$ 'label :=
label sortify
" "
*
#1 entry.max$ substring$
'sort.key$ :=
}
ITERATE {presort}
SORT
% Emit the preamble (if any) and the thebibliography wrapper; "99" is the
% widest-label placeholder for numeric labels.
FUNCTION {begin.bib}
{ preamble$ empty$
'skip$
{ preamble$ write$ newline$ }
if$
"\begin{thebibliography}{99}" write$ newline$
}
FUNCTION {end.bib}
{ newline$
"\end{thebibliography}" write$ newline$
}
EXECUTE {begin.bib}
EXECUTE {init.state.consts}
ITERATE {call.type$}
EXECUTE {end.bib}
================================================
FILE: researchclaw/templates/styles/iclr_2026/iclr2026_conference.sty
================================================
% iclr2026_conference.sty — ICLR 2026 conference style file
% Based on the official ICLR submission template structure.
% Bundled by AutoResearchClaw for offline compilation.
% Official source: https://github.com/ICLR/Master-Template
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{iclr2026_conference}[2026/01/15 ICLR 2026 conference style]
% ── Options ──────────────────────────────────────────────────────────
% "final": de-anonymize (show author block, add the published-paper
% running header).  "preprint" is accepted for template compatibility;
% only the "final" flag changes layout in the code below.
\newif\if@iclr@final \@iclr@finalfalse
\newif\if@iclr@preprint \@iclr@preprintfalse
\DeclareOption{final}{\@iclr@finaltrue}
\DeclareOption{preprint}{\@iclr@preprinttrue}
\ProcessOptions\relax
% ── Page geometry ────────────────────────────────────────────────────
\RequirePackage{geometry}
\geometry{
textwidth=5.5in,
textheight=9.0in,
top=1.0in,
headheight=12pt,
headsep=25pt,
footskip=30pt,
}
% ── Fonts ────────────────────────────────────────────────────────────
\RequirePackage{times}
% ── Spacing ──────────────────────────────────────────────────────────
\renewcommand{\baselinestretch}{1.0}
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}
% ── Section formatting ───────────────────────────────────────────────
\renewcommand{\section}{\@startsection
{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}%
{1.0ex plus .2ex}{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection
{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}%
{0.8ex plus .2ex}{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection
{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}%
{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}}
% ── Title formatting ────────────────────────────────────────────────
% Anonymous byline by default (double-blind review); the real author
% block is typeset only under the "final" option.
\def\@maketitle{%
\vbox to 0pt{}%
\vskip -0.5in
\begin{center}%
{\LARGE\bfseries \@title \par}%
\vskip 0.3in
\if@iclr@final
{\large
\lineskip .5em
\begin{tabular}[t]{c}%
\@author
\end{tabular}\par}%
\else
{\large Anonymous authors\par}%
{\normalsize Paper under double-blind review\par}%
\fi
\vskip 0.3in
\end{center}%
\par
\vskip 0.5em
}
% ── Abstract ─────────────────────────────────────────────────────────
\renewenvironment{abstract}{%
\centerline{\large\bfseries Abstract}%
\vspace{0.5ex}%
\begin{quote}%
}{%
\par
\end{quote}%
\vskip 1ex
}
% ── Headers ──────────────────────────────────────────────────────────
% Centered running header only in final mode; page number in the footer.
\RequirePackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\if@iclr@final
\fancyhead[C]{Published as a conference paper at ICLR 2026}
\else
\fancyhead[C]{}
\fi
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0pt}
% ── Natbib ───────────────────────────────────────────────────────────
% Numeric citations with compressed ranges (e.g. [1-3]).
\RequirePackage[numbers,sort&compress]{natbib}
\endinput
================================================
FILE: researchclaw/templates/styles/icml_2025/icml2025.bst
================================================
%% icml2025.bst — ICML 2025 bibliography style
%% Bundled by AutoResearchClaw for offline compilation.
%% Identical format to icml2026.bst.
%% Minimal numeric style: entries become \bibitem blocks sorted by cite
%% key (see presort below).
ENTRY
{ author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass }
{}
{ label }
% output.state drives the punctuation inserted between output calls.
INTEGERS { output.state before.all mid.sentence after.sentence after.block }
FUNCTION {init.state.consts}
{ #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := }
STRINGS { s t }
% Write the pending string with context-dependent punctuation, then
% leave the new string pending on the stack.
FUNCTION {output.nonnull}
{ 's :=
output.state mid.sentence =
{ ", " * write$ }
{ output.state after.block =
{ add.period$ write$ newline$ "\newblock " write$ }
{ output.state before.all = 'write$ { add.period$ " " * write$ } if$ }
if$
mid.sentence 'output.state :=
}
if$
s
}
% output: like output.nonnull but silently drops empty strings.
FUNCTION {output}
{ duplicate$ empty$ 'pop$ 'output.nonnull if$ }
% output.check: same, but warns when a required field is empty.
FUNCTION {output.check}
{ 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ }
FUNCTION {fin.entry} { add.period$ write$ newline$ }
FUNCTION {new.block}
{ output.state before.all = 'skip$ { after.block 'output.state := } if$ }
% Boolean helpers over 0/1 stack values.
FUNCTION {not} { { #0 } { #1 } if$ }
FUNCTION {and} { 'skip$ { pop$ #0 } if$ }
FUNCTION {or} { { pop$ #1 } 'skip$ if$ }
FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ }
FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ }
INTEGERS { nameptr namesleft numnames }
% Author list as "First Last, ..., and Last"; a literal "others" name
% is rendered as " et~al.".
FUNCTION {format.names}
{ 's := #1 'nameptr := s num.names$ 'numnames := numnames 'namesleft :=
{ namesleft #0 > }
{ s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." * } { " and " * t * } if$ } if$ } 't if$
nameptr #1 + 'nameptr := namesleft #1 - 'namesleft :=
}
while$
}
% Field formatters: each pushes "" when the field is missing.
FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ }
FUNCTION {format.title} { title empty$ { "" } { title } if$ }
FUNCTION {format.btitle} { title emphasize }
FUNCTION {format.date} { year empty$ { "" } { year } if$ }
FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ }
FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ }
FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ }
% Start an entry: "\bibitem{<key>}" plus punctuation-state reset.
FUNCTION {output.bibitem}
{ newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := }
% Entry drivers, one per BibTeX entry type.
FUNCTION {article}
{ output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry }
FUNCTION {inproceedings}
{ output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry }
FUNCTION {conference} { inproceedings }
FUNCTION {book}
{ output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry }
FUNCTION {misc}
{ output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry }
% Remaining types fall back to the permissive misc driver.
FUNCTION {techreport} { misc }
FUNCTION {phdthesis} { misc }
FUNCTION {mastersthesis} { misc }
FUNCTION {unpublished} { misc }
FUNCTION {default.type} { misc }
READ
% Sort by purified, lower-cased cite key.
FUNCTION {sortify} { purify$ "l" change.case$ }
FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := }
ITERATE {presort}
SORT
% Emit any @preamble, then the thebibliography wrapper (widest label 99).
FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } if$ "\begin{thebibliography}{99}" write$ newline$ }
FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ }
EXECUTE {begin.bib}
EXECUTE {init.state.consts}
ITERATE {call.type$}
EXECUTE {end.bib}
================================================
FILE: researchclaw/templates/styles/icml_2025/icml2025.sty
================================================
% icml2025.sty — ICML 2025 style file
% Based on the official ICML submission template structure.
% Bundled by AutoResearchClaw for offline compilation.
% Official source: https://icml.cc/Conferences/2025/StyleAuthorInstructions
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{icml2025}[2025/01/15 ICML 2025 submission style]
% Options: "accepted" shows the author block and the proceedings running
% header; "preprint" shows authors but keeps the running-title header.
\newif\if@icml@accepted \@icml@acceptedfalse
\newif\if@icml@preprint \@icml@preprintfalse
\DeclareOption{accepted}{\@icml@acceptedtrue}
\DeclareOption{preprint}{\@icml@preprinttrue}
\ProcessOptions\relax
% Two-column ICML page layout, activated at package load time.
\RequirePackage{geometry}
\geometry{textwidth=6.875in,textheight=9.25in,columnsep=0.25in,top=0.75in,headheight=12pt,headsep=12pt,footskip=20pt}
\twocolumn
\RequirePackage{times}
\renewcommand{\baselinestretch}{1.0}
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}
\renewcommand{\section}{\@startsection{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}{0.3ex plus .2ex}{\normalfont\normalsize\bfseries}}
% ICML author/affiliation helper macros (superscript affiliation marks).
\newenvironment{icmlauthorlist}{\begin{center}\large}{\end{center}}
\newcommand{\icmlauthor}[2]{#1\textsuperscript{#2}}
\newcommand{\icmlaffiliation}[2]{\par\normalsize\textsuperscript{#1}#2}
\newcommand{\icmltitlerunning}[1]{\def\@icml@runningtitle{#1}}
\def\@icml@runningtitle{}
% Title spans both columns via the optional argument of \twocolumn.
% Authors appear under "accepted" or "preprint"; anonymous otherwise.
\def\@maketitle{%
\twocolumn[%
\vskip -0.3in
\begin{center}%
{\LARGE\bfseries \@title \par}\vskip 0.2in
\if@icml@accepted
{\large\lineskip .5em\begin{tabular}[t]{c}\@author\end{tabular}\par}%
\else\if@icml@preprint
{\large\lineskip .5em\begin{tabular}[t]{c}\@author\end{tabular}\par}%
\else
{\large Anonymous submission\par}%
\fi\fi
\vskip 0.2in
\end{center}%
]%
}
\renewenvironment{abstract}{\centerline{\bfseries Abstract}\vspace{0.5ex}\begin{quote}\small}{\par\end{quote}\vskip 1ex}
% Running header: proceedings line when accepted, running title otherwise.
\RequirePackage{fancyhdr}
\pagestyle{fancy}\fancyhf{}
\if@icml@accepted
\fancyhead[C]{\small Proceedings of the $42^{nd}$ International Conference on Machine Learning, 2025}
\else
\fancyhead[C]{\small\@icml@runningtitle}
\fi
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0pt}
\RequirePackage[numbers,sort&compress]{natbib}
\endinput
================================================
FILE: researchclaw/templates/styles/icml_2026/icml2026.bst
================================================
%% icml2026.bst — ICML 2026 bibliography style
%% Bundled by AutoResearchClaw for offline compilation.
%% Minimal numeric bibliography style compatible with natbib.
%% Entries become \bibitem blocks sorted by cite key (see presort).
ENTRY
{ author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass }
{}
{ label }
% output.state drives the punctuation inserted between output calls.
INTEGERS { output.state before.all mid.sentence after.sentence after.block }
FUNCTION {init.state.consts}
{ #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := }
STRINGS { s t }
% Write the pending string with context-dependent punctuation, then
% leave the new string pending on the stack.
FUNCTION {output.nonnull}
{ 's :=
output.state mid.sentence =
{ ", " * write$ }
{ output.state after.block =
{ add.period$ write$ newline$ "\newblock " write$ }
{ output.state before.all = 'write$ { add.period$ " " * write$ } if$ }
if$
mid.sentence 'output.state :=
}
if$
s
}
% output: like output.nonnull but silently drops empty strings.
FUNCTION {output}
{ duplicate$ empty$ 'pop$ 'output.nonnull if$ }
% output.check: same, but warns when a required field is empty.
FUNCTION {output.check}
{ 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ }
FUNCTION {fin.entry} { add.period$ write$ newline$ }
FUNCTION {new.block}
{ output.state before.all = 'skip$ { after.block 'output.state := } if$ }
% Boolean helpers over 0/1 stack values.
FUNCTION {not} { { #0 } { #1 } if$ }
FUNCTION {and} { 'skip$ { pop$ #0 } if$ }
FUNCTION {or} { { pop$ #1 } 'skip$ if$ }
FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ }
FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ }
INTEGERS { nameptr namesleft numnames }
% Author list as "First Last, ..., and Last"; a literal "others" name
% is rendered as " et~al.".
FUNCTION {format.names}
{ 's := #1 'nameptr := s num.names$ 'numnames := numnames 'namesleft :=
{ namesleft #0 > }
{ s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." * } { " and " * t * } if$ } if$ } 't if$
nameptr #1 + 'nameptr := namesleft #1 - 'namesleft :=
}
while$
}
% Field formatters: each pushes "" when the field is missing.
FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ }
FUNCTION {format.title} { title empty$ { "" } { title } if$ }
FUNCTION {format.btitle} { title emphasize }
FUNCTION {format.date} { year empty$ { "" } { year } if$ }
FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ }
FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ }
FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ }
% Start an entry: "\bibitem{<key>}" plus punctuation-state reset.
FUNCTION {output.bibitem}
{ newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := }
% Entry drivers, one per BibTeX entry type.
FUNCTION {article}
{ output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry }
FUNCTION {inproceedings}
{ output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry }
FUNCTION {conference} { inproceedings }
FUNCTION {book}
{ output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry }
FUNCTION {misc}
{ output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry }
% Remaining types fall back to the permissive misc driver.
FUNCTION {techreport} { misc }
FUNCTION {phdthesis} { misc }
FUNCTION {mastersthesis} { misc }
FUNCTION {unpublished} { misc }
FUNCTION {default.type} { misc }
READ
% Sort by purified, lower-cased cite key.
FUNCTION {sortify} { purify$ "l" change.case$ }
FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := }
ITERATE {presort}
SORT
% Emit any @preamble, then the thebibliography wrapper (widest label 99).
FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } if$ "\begin{thebibliography}{99}" write$ newline$ }
FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ }
EXECUTE {begin.bib}
EXECUTE {init.state.consts}
ITERATE {call.type$}
EXECUTE {end.bib}
================================================
FILE: researchclaw/templates/styles/icml_2026/icml2026.sty
================================================
% icml2026.sty — ICML 2026 style file
% Based on the official ICML submission template structure.
% Bundled by AutoResearchClaw for offline compilation.
% Official source: https://icml.cc/Conferences/2026/AuthorInstructions
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{icml2026}[2026/01/15 ICML 2026 submission style]
% ── Options ──────────────────────────────────────────────────────────
% "accepted": author block + proceedings running header.
% "preprint": author block, but the running-title header is kept.
% Default (neither): anonymous submission byline.
\newif\if@icml@accepted \@icml@acceptedfalse
\newif\if@icml@preprint \@icml@preprintfalse
\DeclareOption{accepted}{\@icml@acceptedtrue}
\DeclareOption{preprint}{\@icml@preprinttrue}
\ProcessOptions\relax
% ── Page geometry (2-column) ─────────────────────────────────────────
\RequirePackage{geometry}
\geometry{
textwidth=6.875in,
textheight=9.25in,
columnsep=0.25in,
top=0.75in,
headheight=12pt,
headsep=12pt,
footskip=20pt,
}
% Two-column mode is activated at package load time.
\twocolumn
% ── Fonts ────────────────────────────────────────────────────────────
\RequirePackage{times}
% ── Spacing ──────────────────────────────────────────────────────────
\renewcommand{\baselinestretch}{1.0}
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}
% ── Section formatting ───────────────────────────────────────────────
\renewcommand{\section}{\@startsection
{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}%
{0.8ex plus .2ex}{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection
{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}%
{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection
{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}%
{0.3ex plus .2ex}{\normalfont\normalsize\bfseries}}
% ── Title formatting ────────────────────────────────────────────────
% ICML-specific author macros (superscript affiliation marks).
\newenvironment{icmlauthorlist}{\begin{center}\large}{\end{center}}
\newcommand{\icmlauthor}[2]{#1\textsuperscript{#2}}
\newcommand{\icmlaffiliation}[2]{\par\normalsize\textsuperscript{#1}#2}
\newcommand{\icmltitlerunning}[1]{\def\@icml@runningtitle{#1}}
\def\@icml@runningtitle{}
% Title spans both columns via the optional argument of \twocolumn.
\def\@maketitle{%
\twocolumn[%
\vskip -0.3in
\begin{center}%
{\LARGE\bfseries \@title \par}%
\vskip 0.2in
\if@icml@accepted
{\large
\lineskip .5em
\begin{tabular}[t]{c}%
\@author
\end{tabular}\par}%
\else
\if@icml@preprint
{\large
\lineskip .5em
\begin{tabular}[t]{c}%
\@author
\end{tabular}\par}%
\else
{\large Anonymous submission\par}%
\fi
\fi
\vskip 0.2in
\end{center}%
]%
}
% ── Abstract ─────────────────────────────────────────────────────────
\renewenvironment{abstract}{%
\centerline{\bfseries Abstract}%
\vspace{0.5ex}%
\begin{quote}\small%
}{%
\par
\end{quote}%
\vskip 1ex
}
% ── Headers ──────────────────────────────────────────────────────────
% Proceedings line when accepted; otherwise the user-set running title.
\RequirePackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\if@icml@accepted
\fancyhead[C]{\small Proceedings of the $43^{rd}$ International Conference on Machine Learning, 2026}
\else
\fancyhead[C]{\small\@icml@runningtitle}
\fi
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0pt}
% ── Natbib ───────────────────────────────────────────────────────────
\RequirePackage[numbers,sort&compress]{natbib}
\endinput
================================================
FILE: researchclaw/templates/styles/neurips_2024/neurips_2024.sty
================================================
% neurips_2024.sty — NeurIPS 2024 style file
% Based on the official NeurIPS submission template structure.
% Bundled by AutoResearchClaw for offline compilation.
% Official source: https://media.neurips.cc/Conferences/NeurIPS2024/Styles.zip
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{neurips_2024}[2024/01/15 NeurIPS 2024 submission style]
% ── Options ──────────────────────────────────────────────────────────
% Preprint mode is the default; "final" switches to camera-ready layout
% (no preprint banner).  "nonatbib" skips loading natbib, for documents
% that use a different citation package.
\newif\if@neurips@preprint \@neurips@preprinttrue
\newif\if@neurips@final \@neurips@finalfalse
\newif\if@neurips@nonatbib \@neurips@nonatbibfalse
\DeclareOption{preprint}{\@neurips@preprinttrue\@neurips@finalfalse}
\DeclareOption{final}{\@neurips@finaltrue\@neurips@preprintfalse}
\DeclareOption{nonatbib}{\@neurips@nonatbibtrue}
\ProcessOptions\relax
% ── Page geometry ────────────────────────────────────────────────────
\RequirePackage{geometry}
\geometry{
textwidth=6.0in,
textheight=9.0in,
top=1.0in,
headheight=12pt,
headsep=25pt,
footskip=30pt,
}
% ── Fonts ────────────────────────────────────────────────────────────
\RequirePackage{times}
% ── Spacing ──────────────────────────────────────────────────────────
\renewcommand{\baselinestretch}{1.0}
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}
% ── Section formatting ───────────────────────────────────────────────
\renewcommand{\section}{\@startsection
{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}%
{1.0ex plus .2ex}{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection
{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}%
{0.8ex plus .2ex}{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection
{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}%
{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}}
% ── Title formatting ────────────────────────────────────────────────
% The author block is always shown; preprint mode adds a banner above it.
\def\@maketitle{%
\vbox to 0pt{}%
\vskip -0.5in
\begin{center}%
{\LARGE\bfseries \@title \par}%
\vskip 0.3in
\if@neurips@preprint
{\large\textit{Preprint. Under review.}\par}%
\vskip 0.1in
\fi
{\large
\lineskip .5em
\begin{tabular}[t]{c}%
\@author
\end{tabular}\par}%
\vskip 0.3in
\end{center}%
\par
\vskip 0.5em
}
% ── Abstract ─────────────────────────────────────────────────────────
\renewenvironment{abstract}{%
\centerline{\large\bfseries Abstract}%
\vspace{0.5ex}%
\begin{quote}%
}{%
\par
\end{quote}%
\vskip 1ex
}
% ── Headers ──────────────────────────────────────────────────────────
% No running header; page number centered in the footer.
\RequirePackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[C]{}
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0pt}
% ── Natbib ───────────────────────────────────────────────────────────
\if@neurips@nonatbib\else
\RequirePackage[numbers,sort&compress]{natbib}
\fi
\endinput
================================================
FILE: researchclaw/templates/styles/neurips_2025/neurips_2025.sty
================================================
% neurips_2025.sty — NeurIPS 2025 style file
% Based on the official NeurIPS submission template structure.
% Bundled by AutoResearchClaw for offline compilation.
% Official source: https://media.neurips.cc/Conferences/NeurIPS2025/Styles.zip
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{neurips_2025}[2025/01/15 NeurIPS 2025 submission style]
% ── Options ──────────────────────────────────────────────────────────
% Preprint mode is the default; "final" switches to camera-ready layout
% (no preprint banner).  "nonatbib" skips loading natbib, for documents
% that use a different citation package.
\newif\if@neurips@preprint \@neurips@preprinttrue
\newif\if@neurips@final \@neurips@finalfalse
\newif\if@neurips@nonatbib \@neurips@nonatbibfalse
\DeclareOption{preprint}{\@neurips@preprinttrue\@neurips@finalfalse}
\DeclareOption{final}{\@neurips@finaltrue\@neurips@preprintfalse}
\DeclareOption{nonatbib}{\@neurips@nonatbibtrue}
\ProcessOptions\relax
% ── Page geometry ────────────────────────────────────────────────────
\RequirePackage{geometry}
\geometry{
textwidth=6.0in,
textheight=9.0in,
top=1.0in,
headheight=12pt,
headsep=25pt,
footskip=30pt,
}
% ── Fonts ────────────────────────────────────────────────────────────
\RequirePackage{times}
% ── Spacing ──────────────────────────────────────────────────────────
\renewcommand{\baselinestretch}{1.0}
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}
% ── Section formatting ───────────────────────────────────────────────
\renewcommand{\section}{\@startsection
{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}%
{1.0ex plus .2ex}{\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection
{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}%
{0.8ex plus .2ex}{\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection
{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}%
{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}}
% ── Title formatting ────────────────────────────────────────────────
% The author block is always shown; preprint mode adds a banner above it.
\def\@maketitle{%
\vbox to 0pt{}%
\vskip -0.5in
\begin{center}%
{\LARGE\bfseries \@title \par}%
\vskip 0.3in
\if@neurips@preprint
{\large\textit{Preprint. Under review.}\par}%
\vskip 0.1in
\fi
{\large
\lineskip .5em
\begin{tabular}[t]{c}%
\@author
\end{tabular}\par}%
\vskip 0.3in
\end{center}%
\par
\vskip 0.5em
}
% ── Abstract ─────────────────────────────────────────────────────────
\renewenvironment{abstract}{%
\centerline{\large\bfseries Abstract}%
\vspace{0.5ex}%
\begin{quote}%
}{%
\par
\end{quote}%
\vskip 1ex
}
% ── Headers ──────────────────────────────────────────────────────────
% No running header; page number centered in the footer.
\RequirePackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[C]{}
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0pt}
% ── Natbib ───────────────────────────────────────────────────────────
\if@neurips@nonatbib\else
\RequirePackage[numbers,sort&compress]{natbib}
\fi
% ── Hyperref-friendly ────────────────────────────────────────────────
% Applied at \begin{document}, only when the document loaded hyperref.
\AtBeginDocument{%
\@ifpackageloaded{hyperref}{%
\hypersetup{colorlinks=true,linkcolor=red,citecolor=green,urlcolor=blue}%
}{}%
}
\endinput
================================================
FILE: researchclaw/trends/__init__.py
================================================
"""Research trend tracking and automatic topic generation."""
from researchclaw.trends.daily_digest import DailyDigest
from researchclaw.trends.trend_analyzer import TrendAnalyzer
from researchclaw.trends.opportunity_finder import OpportunityFinder
from researchclaw.trends.auto_topic import AutoTopicGenerator
from researchclaw.trends.feeds import FeedManager
__all__ = [
"AutoTopicGenerator",
"DailyDigest",
"FeedManager",
"OpportunityFinder",
"TrendAnalyzer",
]
================================================
FILE: researchclaw/trends/auto_topic.py
================================================
"""Automatic research topic generation (ClawZero mode)."""
from __future__ import annotations
import logging
from typing import Any
from researchclaw.trends.opportunity_finder import OpportunityFinder
from researchclaw.trends.trend_analyzer import TrendAnalyzer
logger = logging.getLogger(__name__)
class AutoTopicGenerator:
    """Generate and rank candidate research topics automatically.

    Combines a trend analysis with discovered research opportunities,
    scores each candidate on novelty / feasibility / impact, and returns
    them ranked by overall score.
    """

    def __init__(
        self,
        trend_analyzer: TrendAnalyzer,
        opportunity_finder: OpportunityFinder,
        llm_client: Any = None,
    ):
        # Collaborators are stored as-is; the LLM client is optional and
        # currently unused by the scoring logic itself.
        self.trend_analyzer = trend_analyzer
        self.opportunity_finder = opportunity_finder
        self.llm = llm_client

    async def generate_candidates(
        self,
        domains: list[str],
        papers: list[dict[str, Any]] | None = None,
        count: int = 5,
    ) -> list[dict[str, Any]]:
        """Generate ranked candidate research topics.

        Args:
            domains: Research domains to search for opportunities in.
            papers: Optional paper corpus fed to the trend analyzer.
            count: Maximum number of candidates to return.

        Returns:
            Candidate dicts sorted by descending ``overall_score``.
        """
        # Trend analysis feeds both the opportunity search and scoring.
        analysis = self.trend_analyzer.analyze(papers or [])
        found = await self.opportunity_finder.find_opportunities(
            analysis, domains
        )

        # Score only the first `count` opportunities, then rank them.
        ranked: list[dict[str, Any]] = []
        for candidate in found[:count]:
            scores = self._score_candidate(candidate, analysis)
            ranked.append({
                "topic": candidate["topic"],
                "rationale": candidate.get("rationale", ""),
                "feasibility": candidate.get("feasibility", "medium"),
                "novelty_score": scores["novelty"],
                "feasibility_score": scores["feasibility"],
                "impact_score": scores["impact"],
                "overall_score": scores["overall"],
                "source": candidate.get("source", "unknown"),
            })
        ranked.sort(key=lambda item: item["overall_score"], reverse=True)
        return ranked[:count]

    async def auto_select(
        self,
        domains: list[str],
        papers: list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """Fully automatic topic selection (Zero-Touch mode).

        Returns the single best candidate, or a generic default topic
        when no trend data produced any candidates.
        """
        ranked = await self.generate_candidates(domains, papers, count=5)
        if ranked:
            return ranked[0]
        # No usable trend data: fall back to a generic default topic.
        return {
            "topic": f"Novel approaches in {domains[0] if domains else 'ML'}",
            "rationale": "Default topic (no trends data available)",
            "overall_score": 0.0,
            "source": "default",
        }

    @staticmethod
    def _score_candidate(
        opportunity: dict[str, Any],
        trend_analysis: dict[str, Any],
    ) -> dict[str, float]:
        """Score a candidate topic on novelty, feasibility, and impact.

        Returns a dict with keys ``novelty``, ``feasibility``, ``impact``
        and the weighted ``overall`` score, each rounded to 3 decimals.
        """
        levels = {"high": 0.9, "medium": 0.6, "low": 0.3}
        feasibility = levels.get(
            opportunity.get("feasibility", "medium"), 0.6
        )
        # Novelty drops as the topic shares words with more rising keywords.
        words = set(opportunity.get("topic", "").lower().split())
        overlap = sum(
            1
            for entry in trend_analysis.get("rising_keywords", [])
            if words & set(entry.get("keyword", "").lower().split())
        )
        novelty = max(0.3, 1.0 - overlap * 0.15)
        # Impact scales with recent paper volume, capped at 1.0;
        # neutral 0.5 when there is no paper count at all.
        n_papers = trend_analysis.get("paper_count", 0)
        impact = min(1.0, n_papers / 50) if n_papers > 0 else 0.5
        overall = round(
            0.4 * novelty + 0.3 * feasibility + 0.3 * impact, 3
        )
        return {
            "novelty": round(novelty, 3),
            "feasibility": round(feasibility, 3),
            "impact": round(impact, 3),
            "overall": overall,
        }

    def format_candidates(
        self,
        candidates: list[dict[str, Any]],
    ) -> str:
        """Format candidates as a readable string."""
        if not candidates:
            return "No candidate topics generated."
        out = ["Candidate Research Topics:", "=" * 40, ""]
        for idx, c in enumerate(candidates, 1):
            out.append(f"{idx}. {c['topic']}")
            out.append(
                f" Score: {c['overall_score']:.2f} "
                f"(novelty={c['novelty_score']:.2f}, "
                f"feasibility={c['feasibility_score']:.2f}, "
                f"impact={c['impact_score']:.2f})"
            )
            out.append(f" Rationale: {c.get('rationale', 'N/A')}")
            out.append("")
        return "\n".join(out)
================================================
FILE: researchclaw/trends/daily_digest.py
================================================
"""Daily paper digest generation."""
from __future__ import annotations
import logging
from datetime import date
from pathlib import Path
from typing import Any
from researchclaw.trends.feeds import FeedManager
logger = logging.getLogger(__name__)
class DailyDigest:
    """Generate daily paper digest reports.

    Papers come from the configured FeedManager.  With an LLM client,
    each paper gets a short summary and a 1-5 relevance rating; without
    one, a truncated abstract is shown instead.
    """

    def __init__(
        self,
        feed_manager: FeedManager,
        llm_client: Any = None,
    ):
        # llm_client is optional: generate() picks the rendering mode
        # based on whether it is present.
        self.feeds = feed_manager
        self.llm = llm_client

    async def generate(
        self,
        domains: list[str] | None = None,
        max_papers: int = 20,
        target_date: date | None = None,
    ) -> str:
        """Generate a daily paper digest as Markdown.

        Args:
            domains: Domains to search; defaults to ["machine learning"].
            max_papers: Maximum number of papers in the digest.
            target_date: Date stamped on the digest; defaults to today.

        Returns:
            Markdown text; a short notice when no papers were found.
        """
        effective_domains = domains or ["machine learning"]
        today = target_date or date.today()
        papers = self.feeds.fetch_recent_papers(
            domains=effective_domains,
            max_papers=max_papers,
            since_date=today,
        )
        if not papers:
            return (
                f"## Daily Paper Digest ({today})\n\n"
                f"No new papers found for domains: {', '.join(effective_domains)}\n"
            )
        if self.llm is not None:
            return await self._generate_with_llm(papers, effective_domains, today)
        return self._generate_basic(papers, effective_domains, today)

    @staticmethod
    def _format_authors(authors: Any) -> str:
        """Format an author list as "A, B, C et al.".

        Shared by the LLM and basic renderers (this logic was previously
        duplicated in both).  Accepts a list of names (strings or
        {"name": ...} dicts); any non-list value is stringified as-is.
        """
        if not isinstance(authors, list):
            return str(authors)
        author_str = ", ".join(
            a if isinstance(a, str) else a.get("name", "")
            for a in authors[:3]
        )
        if len(authors) > 3:
            author_str += " et al."
        return author_str

    @staticmethod
    def _header_lines(domains: list[str], today: date, count: int) -> list[str]:
        """Markdown header lines common to both digest renderers."""
        return [
            f"## Daily Paper Digest ({today})",
            f"Domains: {', '.join(domains)}",
            f"Papers found: {count}",
            "",
        ]

    async def _generate_with_llm(
        self,
        papers: list[dict[str, Any]],
        domains: list[str],
        today: date,
    ) -> str:
        """Generate digest with LLM-enhanced summaries."""
        lines = self._header_lines(domains, today, len(papers))
        for i, paper in enumerate(papers, 1):
            title = paper.get("title", "Untitled")
            url = paper.get("url", "")
            abstract = paper.get("abstract", "")[:500]
            author_str = self._format_authors(paper.get("authors", []))
            # Best-effort LLM call: any failure falls back to the raw
            # abstract and a neutral relevance of 3 (never crash the digest).
            try:
                prompt = (
                    f"Summarize this paper in 2 sentences and rate its relevance "
                    f"to {', '.join(domains)} on a scale of 1-5 stars.\n\n"
                    f"Title: {title}\nAbstract: {abstract}\n\n"
                    f"Format: SUMMARY: | RELEVANCE: <1-5>"
                )
                response = await self.llm.chat_async(prompt)
                summary, relevance = self._parse_summary(response)
            except Exception:
                summary = abstract[:200] + "..." if len(abstract) > 200 else abstract
                relevance = 3
            stars = "*" * relevance
            link = f"[{title}]({url})" if url else title
            lines.extend([
                f"### {i}. {link}",
                f"**Authors**: {author_str}",
                f"**Relevance**: {stars}",
                f"**Summary**: {summary}",
                "",
            ])
        return "\n".join(lines)

    def _generate_basic(
        self,
        papers: list[dict[str, Any]],
        domains: list[str],
        today: date,
    ) -> str:
        """Generate basic digest without LLM."""
        lines = self._header_lines(domains, today, len(papers))
        for i, paper in enumerate(papers, 1):
            title = paper.get("title", "Untitled")
            url = paper.get("url", "")
            abstract = paper.get("abstract", "")
            author_str = self._format_authors(paper.get("authors", []))
            short_abstract = (
                abstract[:200] + "..." if len(abstract) > 200 else abstract
            )
            link = f"[{title}]({url})" if url else title
            lines.extend([
                f"### {i}. {link}",
                f"**Authors**: {author_str}",
                f"**Abstract**: {short_abstract}",
                "",
            ])
        return "\n".join(lines)

    @staticmethod
    def _parse_summary(response: str) -> tuple[str, int]:
        """Parse an LLM response of the form "SUMMARY: ... | RELEVANCE: <n>".

        Returns (summary, relevance) with relevance clamped to 1-5.
        Responses without the expected markers yield the raw text with a
        neutral rating of 3.
        """
        summary = response
        relevance = 3
        if "SUMMARY:" in response:
            parts = response.split("|")
            summary = parts[0].split("SUMMARY:", 1)[-1].strip()
            if len(parts) > 1 and "RELEVANCE:" in parts[1]:
                try:
                    rel_str = parts[1].split("RELEVANCE:", 1)[-1].strip()
                    relevance = int(rel_str.strip("* "))
                    relevance = max(1, min(5, relevance))
                except (ValueError, IndexError):
                    pass  # malformed rating: keep the neutral default
        return summary, relevance

    async def generate_and_save(
        self,
        output_dir: Path,
        domains: list[str] | None = None,
        max_papers: int = 20,
    ) -> Path:
        """Generate today's digest and save it as ``digest_<date>.md``.

        Creates ``output_dir`` (and parents) if needed and returns the
        path of the written file.
        """
        today = date.today()
        content = await self.generate(domains, max_papers, today)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"digest_{today}.md"
        output_file.write_text(content, encoding="utf-8")
        return output_file
================================================
FILE: researchclaw/trends/feeds.py
================================================
"""ArXiv / Semantic Scholar / OpenAlex feed management."""
from __future__ import annotations
import logging
from datetime import date, datetime
from typing import Any
logger = logging.getLogger(__name__)
class FeedManager:
    """Manage paper feeds from multiple sources.

    Fetches recent papers from any of :data:`SUPPORTED_SOURCES` and merges
    them into one title-deduplicated list.
    """

    # Source names this manager knows how to query.
    SUPPORTED_SOURCES = ("arxiv", "semantic_scholar", "openalex")

    def __init__(
        self,
        sources: tuple[str, ...] = ("arxiv", "semantic_scholar"),
        s2_api_key: str = "",
    ):
        # Unknown source names are silently dropped.
        self.sources = tuple(
            name for name in sources if name in self.SUPPORTED_SOURCES
        )
        self.s2_api_key = s2_api_key

    def fetch_recent_papers(
        self,
        domains: list[str],
        max_papers: int = 20,
        since_date: date | None = None,
    ) -> list[dict[str, Any]]:
        """Fetch recent papers from configured sources.

        Returns a list of paper dicts with: title, authors, abstract,
        url, source, published_date (plus per-source extras).
        """
        target_date = since_date or date.today()
        fetchers = {
            "arxiv": self._fetch_arxiv,
            "semantic_scholar": self._fetch_s2,
            "openalex": self._fetch_openalex,
        }
        combined: list[dict[str, Any]] = []
        for source in self.sources:
            fetch = fetchers.get(source)
            if fetch is None:
                continue
            try:
                combined.extend(fetch(domains, max_papers, target_date))
            except Exception as exc:
                logger.warning("Feed fetch failed for %s: %s", source, exc)
        # Deduplicate on the lowercased, stripped title (exact match).
        seen_titles: set[str] = set()
        unique: list[dict[str, Any]] = []
        for paper in combined:
            key = paper.get("title", "").lower().strip()
            if key and key not in seen_titles:
                seen_titles.add(key)
                unique.append(paper)
        return unique[:max_papers]

    def _fetch_arxiv(
        self,
        domains: list[str],
        max_papers: int,
        since_date: date,
    ) -> list[dict[str, Any]]:
        """Fetch papers from the arXiv API; [] when unavailable or failing."""
        try:
            from researchclaw.literature.arxiv_client import search_arxiv
        except ImportError:
            logger.debug("arxiv_client not available")
            return []
        query = " OR ".join(domains) if domains else "machine learning"
        try:
            papers: list[dict[str, Any]] = []
            for entry in search_arxiv(query, limit=max_papers):
                papers.append({
                    "title": entry.get("title", ""),
                    "authors": entry.get("authors", []),
                    "abstract": entry.get("abstract", ""),
                    "url": entry.get("url", ""),
                    "source": "arxiv",
                    "published_date": entry.get("published", since_date.isoformat()),
                    "arxiv_id": entry.get("arxiv_id", ""),
                })
            return papers
        except Exception as exc:
            logger.warning("ArXiv fetch failed: %s", exc)
            return []

    def _fetch_s2(
        self,
        domains: list[str],
        max_papers: int,
        since_date: date,
    ) -> list[dict[str, Any]]:
        """Fetch papers from the Semantic Scholar API; [] on any failure."""
        try:
            from researchclaw.literature.semantic_scholar import search_s2
        except ImportError:
            logger.debug("semantic_scholar client not available")
            return []
        query = " ".join(domains) if domains else "machine learning"
        try:
            papers: list[dict[str, Any]] = []
            for entry in search_s2(
                query,
                limit=max_papers,
                year_min=since_date.year,
                api_key=self.s2_api_key,
            ):
                papers.append({
                    "title": entry.get("title", ""),
                    "authors": [
                        author.get("name", "") for author in entry.get("authors", [])
                    ],
                    "abstract": entry.get("abstract", ""),
                    "url": entry.get("url", ""),
                    "source": "semantic_scholar",
                    "published_date": str(entry.get("year", since_date.year)),
                    "citation_count": entry.get("citationCount", 0),
                })
            return papers
        except Exception as exc:
            logger.warning("S2 fetch failed: %s", exc)
            return []

    def _fetch_openalex(
        self,
        domains: list[str],
        max_papers: int,
        since_date: date,
    ) -> list[dict[str, Any]]:
        """Fetch papers from the OpenAlex API; [] on any failure."""
        try:
            from researchclaw.literature.openalex_client import search_openalex
        except ImportError:
            logger.debug("openalex_client not available")
            return []
        query = " ".join(domains) if domains else "machine learning"
        try:
            papers: list[dict[str, Any]] = []
            for entry in search_openalex(query, limit=max_papers):
                papers.append({
                    "title": entry.get("title", ""),
                    "authors": entry.get("authors", []),
                    "abstract": entry.get("abstract", ""),
                    "url": entry.get("url", ""),
                    "source": "openalex",
                    "published_date": entry.get("publication_date", ""),
                    "citation_count": entry.get("cited_by_count", 0),
                })
            return papers
        except Exception as exc:
            logger.warning("OpenAlex fetch failed: %s", exc)
            return []
================================================
FILE: researchclaw/trends/opportunity_finder.py
================================================
"""Research opportunity discovery."""
from __future__ import annotations
import logging
from typing import Any
logger = logging.getLogger(__name__)
class OpportunityFinder:
    """Identify research opportunities from trend analysis."""

    def __init__(self, llm_client: Any = None):
        # Optional LLM client; heuristics are used when it is absent.
        self.llm = llm_client

    async def find_opportunities(
        self,
        trend_analysis: dict[str, Any],
        domains: list[str],
    ) -> list[dict[str, Any]]:
        """Identify research gaps and opportunities."""
        if self.llm is None:
            return self._heuristic_find_opportunities(trend_analysis, domains)
        return await self._llm_find_opportunities(trend_analysis, domains)

    async def _llm_find_opportunities(
        self,
        trend_analysis: dict[str, Any],
        domains: list[str],
    ) -> list[dict[str, Any]]:
        """Ask the LLM for opportunities; fall back to heuristics on error."""
        top_keywords = trend_analysis.get("rising_keywords", [])[:10]
        top_methods = trend_analysis.get("method_trends", [])[:5]
        prompt = (
            "Based on the following research trends, identify 5 promising "
            "research opportunities:\n\n"
            f"Domains: {', '.join(domains)}\n"
            f"Trending keywords: {[k['keyword'] for k in top_keywords]}\n"
            f"Popular methods: {[m['method'] for m in top_methods]}\n\n"
            "For each opportunity, provide:\n"
            "1. A concise research question\n"
            "2. Why it's promising (1 sentence)\n"
            "3. Feasibility estimate (high/medium/low)\n\n"
            "Format each as: TOPIC: ... | WHY: ... | FEASIBILITY: ..."
        )
        try:
            reply = await self.llm.chat_async(prompt)
            return self._parse_opportunities(reply)
        except Exception as exc:
            logger.warning("LLM opportunity finding failed: %s", exc)
            return self._heuristic_find_opportunities(trend_analysis, domains)

    @staticmethod
    def _parse_opportunities(response: str) -> list[dict[str, Any]]:
        """Parse LLM response into structured opportunities (max 5)."""
        markers = ("TOPIC:", "topic:", "1.", "2.", "3.")
        parsed: list[dict[str, Any]] = []
        for raw_line in response.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            if all(marker not in line for marker in markers):
                continue
            parts = line.split("|")
            topic = parts[0].split(":", 1)[-1].strip() if parts else line
            why = parts[1].split(":", 1)[-1].strip() if len(parts) > 1 else ""
            if len(parts) > 2:
                feasibility = parts[2].split(":", 1)[-1].strip().lower()
            else:
                feasibility = "medium"
            if topic:
                parsed.append({
                    "topic": topic,
                    "rationale": why,
                    "feasibility": feasibility,
                    "source": "llm",
                })
        return parsed[:5]

    @staticmethod
    def _heuristic_find_opportunities(
        trend_analysis: dict[str, Any],
        domains: list[str],
    ) -> list[dict[str, Any]]:
        """Cross top trending keywords with popular methods (max 5 items)."""
        found: list[dict[str, Any]] = []
        primary_domain = domains[0] if domains else "ML"
        for kw in trend_analysis.get("rising_keywords", [])[:3]:
            for method in trend_analysis.get("method_trends", [])[:2]:
                if len(found) >= 5:
                    return found
                found.append({
                    "topic": (
                        f"Applying {method['method']} to "
                        f"{kw['keyword']} in {primary_domain}"
                    ),
                    "rationale": (
                        f"'{kw['keyword']}' is trending ({kw['count']} mentions) "
                        f"and '{method['method']}' is a popular method"
                    ),
                    "feasibility": "medium",
                    "source": "heuristic",
                })
        return found
================================================
FILE: researchclaw/trends/trend_analyzer.py
================================================
"""Research trend analysis engine."""
from __future__ import annotations
import re
import logging
from collections import Counter
from typing import Any
logger = logging.getLogger(__name__)
# Common stopwords to exclude from keyword analysis
_STOPWORDS = frozenset({
    # Articles, conjunctions, prepositions, and auxiliary/modal verbs.
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "shall", "can", "need", "must",
    # Pronouns, determiners, and question words.
    "that", "this", "these", "those", "it", "its", "we", "our", "their",
    "which", "what", "how", "when", "where", "who", "whom", "why",
    "not", "no", "nor", "as", "if", "then", "than", "both", "each",
    "all", "any", "few", "more", "most", "some", "such", "only", "very",
    "also", "about", "up", "out", "so", "into", "over", "after", "before",
    "between", "under", "through", "during", "using", "based", "via",
    # Research-paper boilerplate too generic to signal a trend.
    "paper", "propose", "proposed", "method", "approach", "results", "show",
    "new", "novel", "model", "models", "data", "dataset", "task", "tasks",
    "performance", "learning", "training",
})
class TrendAnalyzer:
    """Analyze research trends from paper collections.

    Papers are plain dicts; this class reads the ``title``, ``abstract``,
    ``authors`` and ``source`` keys (all optional).
    """

    def __init__(self, min_keyword_length: int = 3):
        # Tokens shorter than this are ignored for keyword statistics.
        self.min_keyword_length = min_keyword_length

    def analyze(
        self,
        papers: list[dict[str, Any]],
        window_days: int = 30,
    ) -> dict[str, Any]:
        """Analyze trends in a collection of papers.

        ``window_days`` is currently unused and kept for interface
        stability. Returns a dict with ``rising_keywords``,
        ``hot_authors``, ``popular_datasets``, ``method_trends``,
        ``paper_count`` and (for non-empty input) ``source_distribution``.
        """
        if not papers:
            return {
                "rising_keywords": [],
                "hot_authors": [],
                "popular_datasets": [],
                "method_trends": [],
                "paper_count": 0,
            }
        keywords = self._extract_keywords(papers)
        authors = self._extract_authors(papers)
        datasets = self._extract_datasets(papers)
        methods = self._extract_methods(papers)
        return {
            "rising_keywords": keywords[:20],
            "hot_authors": authors[:10],
            "popular_datasets": datasets[:10],
            "method_trends": methods[:10],
            "paper_count": len(papers),
            "source_distribution": self._source_distribution(papers),
        }

    def _extract_keywords(
        self,
        papers: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Extract and rank keywords (unigrams + bigrams) from titles/abstracts."""
        word_counts: Counter[str] = Counter()
        bigram_counts: Counter[str] = Counter()
        min_len = self.min_keyword_length
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}"
            words = self._tokenize(text)
            for w in words:
                if w not in _STOPWORDS and len(w) >= min_len:
                    word_counts[w] += 1
            for w1, w2 in zip(words, words[1:]):
                # Fix: require BOTH bigram words to meet the length floor
                # (previously only w1 was checked).
                if (
                    w1 not in _STOPWORDS
                    and w2 not in _STOPWORDS
                    and len(w1) >= min_len
                    and len(w2) >= min_len
                ):
                    bigram_counts[f"{w1} {w2}"] += 1
        results = []
        for keyword, count in bigram_counts.most_common(30):
            if count >= 2:
                results.append({"keyword": keyword, "count": count, "type": "bigram"})
        for keyword, count in word_counts.most_common(30):
            if count >= 2:
                results.append({"keyword": keyword, "count": count, "type": "unigram"})
        # Stable sort keeps bigrams ahead of equal-count unigrams.
        results.sort(key=lambda x: -x["count"])
        return results[:20]

    def _extract_authors(
        self,
        papers: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Extract most prolific authors (appearing on 2+ papers)."""
        author_counts: Counter[str] = Counter()
        for paper in papers:
            authors = paper.get("authors", [])
            if isinstance(authors, list):
                for author in authors:
                    # Authors may be plain strings or {"name": ...} dicts.
                    name = author if isinstance(author, str) else author.get("name", "")
                    if name:
                        author_counts[name] += 1
        return [
            {"author": name, "paper_count": count}
            for name, count in author_counts.most_common(10)
            if count >= 2
        ]

    @staticmethod
    def _count_mentions(
        papers: list[dict[str, Any]],
        patterns: list[str],
    ) -> Counter[str]:
        """Count papers mentioning each pattern as a whole word.

        Uses word-boundary matching (with an optional plural "s") so that
        e.g. "ARC" no longer matches inside "search"/"architecture" and
        "GAN" no longer matches inside "organization" — the previous
        substring check produced such false positives. Each paper counts
        at most once per pattern.
        """
        compiled = [
            (name, re.compile(rf"\b{re.escape(name)}s?\b", re.IGNORECASE))
            for name in patterns
        ]
        counts: Counter[str] = Counter()
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}"
            for name, regex in compiled:
                if regex.search(text):
                    counts[name] += 1
        return counts

    def _extract_datasets(
        self,
        papers: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Extract commonly mentioned datasets."""
        dataset_patterns = [
            "ImageNet", "CIFAR", "MNIST", "COCO", "SQuAD", "GLUE",
            "SuperGLUE", "WikiText", "Penn Treebank", "WMT",
            "OpenWebText", "Common Crawl", "BookCorpus",
            "MMLU", "HumanEval", "GSM8K", "ARC", "HellaSwag",
        ]
        dataset_counts = self._count_mentions(papers, dataset_patterns)
        return [
            {"dataset": ds, "mention_count": count}
            for ds, count in dataset_counts.most_common(10)
            if count >= 1
        ]

    def _extract_methods(
        self,
        papers: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Extract commonly mentioned methods/architectures."""
        method_patterns = [
            "transformer", "attention", "diffusion", "GAN", "VAE",
            "reinforcement learning", "contrastive learning",
            "self-supervised", "few-shot", "zero-shot", "in-context",
            "fine-tuning", "pre-training", "RLHF", "DPO",
            "chain-of-thought", "retrieval-augmented", "RAG",
            "mixture of experts", "MoE", "LoRA", "quantization",
            "knowledge distillation", "pruning", "graph neural",
        ]
        method_counts = self._count_mentions(papers, method_patterns)
        return [
            {"method": method, "mention_count": count}
            for method, count in method_counts.most_common(10)
            if count >= 1
        ]

    @staticmethod
    def _source_distribution(
        papers: list[dict[str, Any]],
    ) -> dict[str, int]:
        """Count papers by source (missing source → "unknown")."""
        dist: Counter[str] = Counter()
        for paper in papers:
            dist[paper.get("source", "unknown")] += 1
        return dict(dist)

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Lowercased word tokenization; keeps internal hyphens/apostrophes."""
        return [w.lower() for w in re.findall(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)*", text)]

    def generate_trend_report(
        self,
        analysis: dict[str, Any],
    ) -> str:
        """Format trend analysis as a readable plain-text report."""
        lines = [
            f"Research Trend Analysis ({analysis.get('paper_count', 0)} papers)",
            "=" * 50,
            "",
        ]
        keywords = analysis.get("rising_keywords", [])
        if keywords:
            lines.append("Top Keywords:")
            for kw in keywords[:10]:
                lines.append(f"  - {kw['keyword']} ({kw['count']} mentions)")
            lines.append("")
        authors = analysis.get("hot_authors", [])
        if authors:
            lines.append("Most Active Authors:")
            for a in authors[:5]:
                lines.append(f"  - {a['author']} ({a['paper_count']} papers)")
            lines.append("")
        methods = analysis.get("method_trends", [])
        if methods:
            lines.append("Method Trends:")
            for m in methods[:5]:
                lines.append(f"  - {m['method']} ({m['mention_count']} mentions)")
            lines.append("")
        return "\n".join(lines)
================================================
FILE: researchclaw/utils/__init__.py
================================================
"""ResearchClaw utility functions."""
from researchclaw.utils.sanitize import sanitize_figure_id
__all__ = ["sanitize_figure_id"]
================================================
FILE: researchclaw/utils/sanitize.py
================================================
"""Input sanitization utilities for untrusted LLM-generated values."""
from __future__ import annotations
import re
def sanitize_figure_id(raw_id: str, *, fallback: str = "figure") -> str:
    """Sanitize a figure ID for safe use in file paths and Docker names.

    Strips path separators, dotdot sequences, and shell metacharacters.
    Returns *fallback* if the sanitized result is empty.

    >>> sanitize_figure_id("../../etc/evil")
    'etc_evil'
    >>> sanitize_figure_id("fig test (v2)")
    'fig_test_v2'
    >>> sanitize_figure_id("")
    'figure'
    """
    # Drop ".." traversal sequences, then neutralize both separator styles.
    without_dots = raw_id.replace("..", "")
    without_seps = without_dots.replace("/", "_").replace("\\", "_")
    # Anything outside [A-Za-z0-9_.-] becomes an underscore.
    safe = re.sub(r"[^a-zA-Z0-9_.-]", "_", without_seps)
    # Squash underscore runs and trim leading/trailing "_" and ".".
    squashed = re.sub(r"_+", "_", safe).strip("_.")
    return squashed if squashed else fallback
================================================
FILE: researchclaw/utils/thinking_tags.py
================================================
"""Strip reasoning artifacts from LLM output before they leak into papers.
Handles ALL known thinking/reasoning formats:
- ``... `` -- DeepSeek-R1, QwQ, Gemini 2.5 format
- ``[thinking] ...`` -- Claude Code / ACP output format (bracket-style)
- Insight blocks -- Claude Code explanatory mode decorators
- ``[plan] ...`` -- Claude Code plan mode markers
- ``[tool] ...`` -- ACP tool invocation output
- ``[client] ...``, ``[acpx] ...``, ``[done] ...`` -- acpx metadata
Without this stripping, these artifacts contaminate:
- Paper drafts (LaTeX / Markdown)
- Generated experiment code
- YAML/JSON responses (search plans, experiment plans)
- Citation references
Usage::
from researchclaw.utils.thinking_tags import strip_thinking_tags
clean = strip_thinking_tags(raw_llm_output)
"""
from __future__ import annotations
import re
# ---------------------------------------------------------------------------
# Pattern 1: XML-style <think> ... </think> (DeepSeek-R1, QwQ, Gemini)
#
# NOTE: the literal tags were reconstructed from the surrounding comments —
# an earlier text-extraction pass had stripped the angle-bracket tags from
# these regexes, leaving patterns that deleted arbitrary text.
# ---------------------------------------------------------------------------
_THINK_BLOCK_RE = re.compile(
    r"<think>.*?</think>",
    re.DOTALL | re.IGNORECASE,
)
# Model stopped mid-reasoning: an opening tag with no close consumes the rest.
_THINK_UNCLOSED_RE = re.compile(
    r"<think>.*",
    re.DOTALL | re.IGNORECASE,
)
_THINK_STRAY_CLOSE_RE = re.compile(
    r"</think>",
    re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Pattern 2: [thinking] blocks (Claude Code / ACP)
# ---------------------------------------------------------------------------
_BRACKET_THINKING_RE = re.compile(
    r"\[thinking\].*?(?=\n\n(?!\[thinking\])|\n(?:#{1,3}\s)|\n```|\Z)",
    re.DOTALL | re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Pattern 3: Insight blocks (Claude Code explanatory style)
# ---------------------------------------------------------------------------
_INSIGHT_BLOCK_RE = re.compile(
    r"`[*\u2605]\s*Insight[^`]*`\s*\n.*?`[\u2500-]+`",
    re.DOTALL,
)
_INSIGHT_ASCII_RE = re.compile(
    r"`\*\s*Insight[-]+`\s*\n.*?`[-]+`",
    re.DOTALL,
)
# ---------------------------------------------------------------------------
# Pattern 4: [plan] blocks (Claude Code plan mode)
# ---------------------------------------------------------------------------
_PLAN_BLOCK_RE = re.compile(
    r"\[plan\].*?(?=\n\n|\Z)",
    re.DOTALL,
)
# ---------------------------------------------------------------------------
# Pattern 5: ACP/acpx metadata lines
# ---------------------------------------------------------------------------
_ACPX_LINE_RE = re.compile(
    r"^\[(client|acpx|tool|done)\](?!\().*$",
    re.MULTILINE | re.IGNORECASE,
)


def strip_thinking_tags(text: str) -> str:
    """Remove all reasoning artifacts from LLM output.

    Handles XML ``<think>`` tags, bracket ``[thinking]`` blocks, insight
    decorators, plan markers, and acpx metadata lines.
    Returns cleaned text suitable for paper drafts, code, or YAML/JSON.
    """
    if not text:
        return text
    result = text
    # Phase 1: XML <think> ... </think> blocks.
    # Cheap substring pre-filter covers opening, unclosed, and stray tags.
    if "think" in result.lower():
        result = _THINK_BLOCK_RE.sub("", result)
        result = _THINK_UNCLOSED_RE.sub("", result)
        result = _THINK_STRAY_CLOSE_RE.sub("", result)
    # Phase 2: [thinking] blocks (ACP/Claude Code)
    if "[thinking]" in result.lower():
        result = _BRACKET_THINKING_RE.sub("", result)
        # Catch single-line [thinking] remnants the block regex missed.
        result = re.sub(
            r"^\[thinking\].*$", "", result,
            flags=re.MULTILINE | re.IGNORECASE,
        )
    # Phase 3: Insight blocks
    result = _INSIGHT_BLOCK_RE.sub("", result)
    result = _INSIGHT_ASCII_RE.sub("", result)
    # Phase 4: [plan] blocks
    if "[plan]" in result.lower():
        result = _PLAN_BLOCK_RE.sub("", result)
    # Phase 5: acpx metadata lines
    result = _ACPX_LINE_RE.sub("", result)
    # Phase 6: Clean up leftover horizontal-rule decorations
    result = re.sub(r"^`[\u2500-]+`\s*$", "", result, flags=re.MULTILINE)
    result = re.sub(r"^`[-]{20,}`\s*$", "", result, flags=re.MULTILINE)
    # Collapse excessive blank lines
    result = re.sub(r"\n{3,}", "\n\n", result)
    return result.strip()
================================================
FILE: researchclaw/voice/__init__.py
================================================
"""Voice interaction modules."""
================================================
FILE: researchclaw/voice/commands.py
================================================
"""Voice command parsing."""
from __future__ import annotations
import re
from dataclasses import dataclass
from enum import Enum
class VoiceCommand(str, Enum):
    """Recognized voice commands."""

    START = "start"
    STOP = "stop"
    PAUSE = "pause"
    RESUME = "resume"
    STATUS = "status"
    NONE = "none"  # Not a command; forward the text to chat instead.


@dataclass
class ParsedVoiceInput:
    """Result of parsing voice input."""

    command: VoiceCommand  # recognized command, or NONE
    text: str  # stripped original transcription


# Ordered (command, pattern) pairs; the first match wins. Patterns cover
# English and Chinese trigger words and are anchored to the input start.
_COMMAND_PATTERNS: list[tuple[VoiceCommand, re.Pattern[str]]] = [
    (VoiceCommand.START, re.compile(r"^(?:start|run|开始|启动)", re.IGNORECASE)),
    (VoiceCommand.STOP, re.compile(r"^(?:stop|停止|结束|终止)", re.IGNORECASE)),
    (VoiceCommand.PAUSE, re.compile(r"^(?:pause|暂停|等一下)", re.IGNORECASE)),
    (VoiceCommand.RESUME, re.compile(r"^(?:resume|continue|继续|恢复)", re.IGNORECASE)),
    (VoiceCommand.STATUS, re.compile(r"^(?:status|progress|进度|到哪了|查看)", re.IGNORECASE)),
]


def parse_voice_input(text: str) -> ParsedVoiceInput:
    """Parse transcribed voice input into a command plus the stripped text."""
    cleaned = text.strip()
    matched = next(
        (cmd for cmd, pattern in _COMMAND_PATTERNS if pattern.search(cleaned)),
        VoiceCommand.NONE,
    )
    return ParsedVoiceInput(command=matched, text=cleaned)
================================================
FILE: researchclaw/voice/synthesizer.py
================================================
"""Text-to-speech synthesis."""
from __future__ import annotations
import logging
from typing import Any
logger = logging.getLogger(__name__)
class VoiceSynthesizer:
    """Convert text to speech audio."""

    def __init__(self, server_config: Any) -> None:
        # Server configuration object; not read by synthesize() itself,
        # kept for parity with VoiceTranscriber.
        self._config = server_config

    async def synthesize(
        self,
        text: str,
        voice: str = "alloy",
        speed: float = 1.0,
    ) -> bytes:
        """Synthesize text to audio bytes using OpenAI TTS API.

        Parameters
        ----------
        text:
            Text to render as speech.
        voice:
            OpenAI TTS voice name.
        speed:
            Playback speed multiplier passed through to the API.

        Raises RuntimeError when httpx is missing or OPENAI_API_KEY is
        unset; httpx raises on a non-2xx API response.
        """
        try:
            import httpx
        except ImportError:
            raise RuntimeError("httpx required for TTS")
        import os
        # API key comes from the environment, not from server_config.
        api_key = os.environ.get("OPENAI_API_KEY", "")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY not set for TTS")
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.openai.com/v1/audio/speech",
                headers={"Authorization": f"Bearer {api_key}"},
                json={
                    "model": "tts-1",
                    "input": text,
                    "voice": voice,
                    "speed": speed,
                },
            )
            response.raise_for_status()
            # Raw audio bytes from the non-streaming endpoint.
            return response.content
================================================
FILE: researchclaw/voice/transcriber.py
================================================
"""Voice transcription via Whisper API."""
from __future__ import annotations
import logging
from typing import Any, AsyncIterator
logger = logging.getLogger(__name__)
class VoiceTranscriber:
    """Transcribe audio to text using Whisper API."""

    def __init__(self, server_config: Any) -> None:
        # Whisper model name and optional custom endpoint from server config.
        self._model = server_config.whisper_model
        self._api_url = server_config.whisper_api_url

    async def transcribe(
        self,
        audio_bytes: bytes,
        language: str = "zh",
    ) -> str:
        """Transcribe audio bytes to text.

        Uses OpenAI Whisper API or compatible endpoint.

        Raises RuntimeError when httpx is missing or OPENAI_API_KEY is
        unset; httpx raises on a non-2xx API response. Returns "" when
        the API reply carries no "text" field.
        """
        try:
            import httpx
        except ImportError:
            raise RuntimeError(
                "httpx is required for voice transcription. "
                "Install with: pip install httpx"
            )
        # Fall back to the public OpenAI endpoint when no custom URL is set.
        url = self._api_url or "https://api.openai.com/v1/audio/transcriptions"
        import os
        api_key = os.environ.get("OPENAI_API_KEY", "")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY not set for Whisper API")
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                url,
                headers={"Authorization": f"Bearer {api_key}"},
                # NOTE(review): payload is always labeled webm — confirm
                # callers only send webm audio.
                files={"file": ("audio.webm", audio_bytes, "audio/webm")},
                data={
                    "model": self._model,
                    "language": language,
                },
            )
            response.raise_for_status()
            result = response.json()
            return result.get("text", "")

    async def transcribe_stream(
        self,
        audio_stream: AsyncIterator[bytes],
        language: str = "zh",
    ) -> AsyncIterator[str]:
        """Stream transcription (collects chunks then transcribes).

        Not incrementally streamed: buffers the entire stream in memory,
        then yields a single transcript. Yields nothing for an empty
        stream.
        """
        chunks: list[bytes] = []
        async for chunk in audio_stream:
            chunks.append(chunk)
        if chunks:
            full_audio = b"".join(chunks)
            text = await self.transcribe(full_audio, language=language)
            yield text
================================================
FILE: researchclaw/web/__init__.py
================================================
"""Web search, crawling, and content extraction layer.
Provides unified access to:
- **Crawl4AI**: Web page → Markdown extraction
- **Tavily**: AI-native web search API
- **scholarly**: Google Scholar search
- **PDF extraction**: Full-text from PDF files
Public API
----------
- ``WebSearchAgent`` — orchestrates all web capabilities
- ``WebCrawler`` — Crawl4AI wrapper
- ``WebSearchClient`` — Tavily search wrapper
- ``GoogleScholarClient`` — scholarly wrapper
- ``PDFExtractor`` — PDF text extraction
- ``check_url_ssrf`` — SSRF validation for URLs
"""
from researchclaw.web._ssrf import check_url_ssrf
from researchclaw.web.crawler import WebCrawler
from researchclaw.web.search import WebSearchClient
from researchclaw.web.scholar import GoogleScholarClient
from researchclaw.web.pdf_extractor import PDFExtractor
from researchclaw.web.agent import WebSearchAgent
__all__ = [
"check_url_ssrf",
"WebCrawler",
"WebSearchClient",
"GoogleScholarClient",
"PDFExtractor",
"WebSearchAgent",
]
================================================
FILE: researchclaw/web/_ssrf.py
================================================
"""SSRF validation for URLs fetched by the web layer."""
from __future__ import annotations
import ipaddress
import socket
from urllib.parse import urlparse
def check_url_ssrf(url: str) -> str | None:
    """Return an error message if *url* targets a private/internal host.

    Validates the scheme (http/https only), then checks EVERY address the
    hostname resolves to — not just the first — against loopback, private
    (RFC 1918), link-local, reserved, multicast, and unspecified ranges
    via :mod:`ipaddress`. IPv4-mapped IPv6 literals (``::ffff:a.b.c.d``)
    are unwrapped first so they cannot smuggle a private IPv4 target past
    the check on Python versions where ``is_private`` ignores the mapping.

    Returns ``None`` if the URL is safe to fetch.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return f"Unsupported URL scheme: {parsed.scheme}"
    hostname = parsed.hostname or ""
    if not hostname:
        return "URL has no hostname"
    # Try parsing hostname as a literal IP address first.
    try:
        addresses = [ipaddress.ip_address(hostname)]
    except ValueError:
        # Domain name — resolve ALL its addresses (checking only the first
        # record would let a multi-record domain slip an internal IP through).
        try:
            info = socket.getaddrinfo(
                hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM
            )
            addresses = [ipaddress.ip_address(entry[4][0]) for entry in info]
        except (socket.gaierror, OSError, ValueError, IndexError):
            # Can't resolve — let the actual request fail naturally.
            return None
    for addr in addresses:
        # Unwrap ::ffff:a.b.c.d to its IPv4 form before classification.
        if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None:
            addr = addr.ipv4_mapped
        if (
            addr.is_private
            or addr.is_loopback
            or addr.is_link_local
            or addr.is_reserved
            or addr.is_multicast
            or addr.is_unspecified
        ):
            return f"Blocked internal/private URL: {hostname}"
    return None
================================================
FILE: researchclaw/web/agent.py
================================================
"""Unified Web Search Agent.
Orchestrates all web capabilities (Tavily, Google Scholar, Crawl4AI,
PDF extraction) into a single search-and-extract pipeline.
Usage::
agent = WebSearchAgent()
result = agent.search_and_extract(
topic="knowledge distillation for vision transformers",
search_queries=["knowledge distillation survey", "ViT compression"],
)
# result.papers — Google Scholar papers
# result.web_results — Tavily/DDG web search results
# result.crawled_pages — full-text from crawled URLs
"""
from __future__ import annotations
import asyncio
import logging
import time
from dataclasses import dataclass, field
from typing import Any
from researchclaw.web.crawler import CrawlResult, WebCrawler
from researchclaw.web.pdf_extractor import PDFContent, PDFExtractor
from researchclaw.web.scholar import GoogleScholarClient, ScholarPaper
from researchclaw.web.search import SearchResult, WebSearchClient, WebSearchResponse
logger = logging.getLogger(__name__)
@dataclass
class WebSearchAgentResult:
    """Combined result from all web search sources."""

    topic: str
    web_results: list[SearchResult] = field(default_factory=list)
    scholar_papers: list[ScholarPaper] = field(default_factory=list)
    crawled_pages: list[CrawlResult] = field(default_factory=list)
    pdf_extractions: list[PDFContent] = field(default_factory=list)
    search_answer: str = ""  # Tavily AI answer if available
    elapsed_seconds: float = 0.0

    @property
    def total_results(self) -> int:
        """Total number of items collected across all four sources."""
        buckets = (
            self.web_results,
            self.scholar_papers,
            self.crawled_pages,
            self.pdf_extractions,
        )
        return sum(len(bucket) for bucket in buckets)

    def to_context_string(self, *, max_length: int = 30_000) -> str:
        """Convert all results to a single context string for LLM injection.

        The output is structured Markdown suitable for prompt injection;
        anything beyond *max_length* characters is cut with a truncation
        marker.
        """
        chunks: list[str] = []
        # Tavily AI answer
        if self.search_answer:
            chunks += ["## AI Search Summary", self.search_answer, ""]
        # Web search results (top 15)
        if self.web_results:
            chunks.append("## Web Search Results")
            for idx, hit in enumerate(self.web_results[:15], 1):
                chunks.append(f"### [{idx}] {hit.title}")
                chunks.append(f"URL: {hit.url}")
                if hit.snippet:
                    chunks.append(hit.snippet)
                chunks.append("")
        # Google Scholar papers (top 10)
        if self.scholar_papers:
            chunks.append("## Google Scholar Papers")
            for paper in self.scholar_papers[:10]:
                byline = ", ".join(paper.authors[:3])
                if len(paper.authors) > 3:
                    byline += " et al."
                chunks.append(
                    f"- **{paper.title}** ({byline}, {paper.year}) "
                    f"[{paper.citation_count} citations]"
                )
                if paper.abstract:
                    chunks.append(f"  {paper.abstract[:200]}...")
            chunks.append("")
        # Crawled page content (capped per page)
        if self.crawled_pages:
            chunks.append("## Crawled Page Content")
            for page in self.crawled_pages:
                if page.has_content:
                    chunks.append(f"### {page.title or page.url}")
                    chunks.append(page.markdown[:3000])
                    chunks.append("")
        # PDF extractions (capped per document)
        if self.pdf_extractions:
            chunks.append("## PDF Full-Text Extractions")
            for doc in self.pdf_extractions:
                if doc.has_content:
                    chunks.append(f"### {doc.title or doc.path}")
                    if doc.abstract:
                        chunks.append(f"**Abstract:** {doc.abstract}")
                    chunks.append(doc.text[:3000])
                    chunks.append("")
        rendered = "\n".join(chunks)
        if len(rendered) > max_length:
            return rendered[:max_length] + "\n\n[... truncated]"
        return rendered

    def to_dict(self) -> dict[str, Any]:
        """Serialize to dict for JSON output."""
        return {
            "topic": self.topic,
            "web_results_count": len(self.web_results),
            "scholar_papers_count": len(self.scholar_papers),
            "crawled_pages_count": len(self.crawled_pages),
            "pdf_extractions_count": len(self.pdf_extractions),
            "has_search_answer": bool(self.search_answer),
            "elapsed_seconds": self.elapsed_seconds,
            "web_results": [item.to_dict() for item in self.web_results[:20]],
            "scholar_papers": [item.to_dict() for item in self.scholar_papers[:20]],
        }
class WebSearchAgent:
"""Orchestrates all web search and content extraction capabilities.
Parameters
----------
tavily_api_key:
Tavily API key (optional, falls back to env var or DuckDuckGo).
enable_scholar:
Whether to include Google Scholar search.
enable_crawling:
Whether to crawl top URLs for full content.
enable_pdf:
Whether to extract PDF content.
max_web_results:
Maximum web search results.
max_scholar_results:
Maximum Google Scholar results.
max_crawl_urls:
Maximum URLs to crawl for full content.
"""
def __init__(
    self,
    *,
    tavily_api_key: str = "",
    enable_scholar: bool = True,
    enable_crawling: bool = True,
    enable_pdf: bool = True,
    max_web_results: int = 10,
    max_scholar_results: int = 10,
    max_crawl_urls: int = 5,
) -> None:
    """Initialize all web clients; see the class docstring for parameters."""
    self.web_client = WebSearchClient(api_key=tavily_api_key)
    # GoogleScholarClient raises ImportError when its backend package is
    # missing; degrade gracefully by disabling scholar search.
    try:
        self.scholar_client = GoogleScholarClient()
    except ImportError:
        self.scholar_client = None  # type: ignore[assignment]
    self.crawler = WebCrawler()
    self.pdf_extractor = PDFExtractor()
    # Feature toggles and per-source result caps.
    self.enable_scholar = enable_scholar
    self.enable_crawling = enable_crawling
    self.enable_pdf = enable_pdf
    self.max_web_results = max_web_results
    self.max_scholar_results = max_scholar_results
    self.max_crawl_urls = max_crawl_urls
def search_and_extract(
    self,
    topic: str,
    *,
    search_queries: list[str] | None = None,
    crawl_urls: list[str] | None = None,
    pdf_urls: list[str] | None = None,
) -> WebSearchAgentResult:
    """Run the full search + extraction pipeline.

    Parameters
    ----------
    topic:
        Research topic string.
    search_queries:
        Custom search queries. If None, auto-generates from topic.
    crawl_urls:
        Specific URLs to crawl. If None, crawls top search result URLs.
    pdf_urls:
        Specific PDF URLs to extract. If None, extracts PDFs from search.

    Returns
    -------
    WebSearchAgentResult populated with whatever each enabled stage
    produced; the _run_* helpers log stage failures instead of raising.
    """
    t0 = time.monotonic()
    result = WebSearchAgentResult(topic=topic)
    # 1. Generate search queries if not provided
    if search_queries is None:
        search_queries = self._generate_queries(topic)
    # 2. Web search (Tavily / DuckDuckGo)
    self._run_web_search(result, search_queries)
    # 3. Google Scholar search (skipped when the optional client failed to load)
    if self.enable_scholar and self.scholar_client and self.scholar_client.available:
        self._run_scholar_search(result, topic)
    # 4. Crawl top URLs for full content
    if self.enable_crawling:
        urls_to_crawl = crawl_urls or self._select_urls_to_crawl(result)
        if urls_to_crawl:
            self._run_crawling(result, urls_to_crawl)
    # 5. Extract PDFs
    if self.enable_pdf:
        pdf_targets = pdf_urls or self._find_pdf_urls(result)
        if pdf_targets:
            self._run_pdf_extraction(result, pdf_targets)
    result.elapsed_seconds = time.monotonic() - t0
    logger.info(
        "[WebSearchAgent] Done: %d web, %d scholar, %d crawled, %d PDFs (%.1fs)",
        len(result.web_results),
        len(result.scholar_papers),
        len(result.crawled_pages),
        len(result.pdf_extractions),
        result.elapsed_seconds,
    )
    return result
# ------------------------------------------------------------------
# Pipeline steps
# ------------------------------------------------------------------
def _run_web_search(
self, result: WebSearchAgentResult, queries: list[str]
) -> None:
"""Run web search across all queries."""
try:
responses = self.web_client.search_multi(
queries, max_results=self.max_web_results
)
for resp in responses:
result.web_results.extend(resp.results)
if resp.answer and not result.search_answer:
result.search_answer = resp.answer
except Exception as exc: # noqa: BLE001
logger.warning("Web search failed: %s", exc)
def _run_scholar_search(
self, result: WebSearchAgentResult, topic: str
) -> None:
"""Run Google Scholar search."""
try:
papers = self.scholar_client.search(
topic, limit=self.max_scholar_results
)
result.scholar_papers.extend(papers)
except Exception as exc: # noqa: BLE001
logger.warning("Scholar search failed: %s", exc)
def _run_crawling(
self, result: WebSearchAgentResult, urls: list[str]
) -> None:
"""Crawl URLs for full content."""
try:
loop = None
try:
loop = asyncio.get_running_loop()
except RuntimeError:
pass
if loop and loop.is_running():
# We're inside an async context — use sync fallback
for url in urls[: self.max_crawl_urls]:
cr = self.crawler.crawl_sync(url)
if cr.has_content:
result.crawled_pages.append(cr)
else:
crawl_results = asyncio.run(
self.crawler.crawl_many(urls[: self.max_crawl_urls])
)
result.crawled_pages.extend(
cr for cr in crawl_results if cr.has_content
)
except Exception as exc: # noqa: BLE001
logger.warning("Crawling failed: %s", exc)
def _run_pdf_extraction(
self, result: WebSearchAgentResult, urls: list[str]
) -> None:
"""Extract text from PDF URLs."""
for url in urls[:5]:
try:
pdf = self.pdf_extractor.extract_from_url(url)
if pdf.has_content:
result.pdf_extractions.append(pdf)
except Exception as exc: # noqa: BLE001
logger.warning("PDF extraction failed for %s: %s", url, exc)
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
@staticmethod
def _generate_queries(topic: str) -> list[str]:
"""Generate search queries from a topic string."""
queries = [
topic,
f"{topic} survey",
f"{topic} benchmark state of the art",
]
return queries
def _select_urls_to_crawl(self, result: WebSearchAgentResult) -> list[str]:
"""Select top URLs from search results for crawling."""
urls = []
seen = set()
for r in result.web_results:
if r.url and r.url not in seen:
# Skip PDF URLs (handled separately) and common non-content sites
if r.url.endswith(".pdf"):
continue
seen.add(r.url)
urls.append(r.url)
if len(urls) >= self.max_crawl_urls:
break
return urls
@staticmethod
def _find_pdf_urls(result: WebSearchAgentResult) -> list[str]:
"""Find PDF URLs from search results."""
pdf_urls = []
seen = set()
for r in result.web_results:
if r.url and r.url.endswith(".pdf") and r.url not in seen:
seen.add(r.url)
pdf_urls.append(r.url)
if len(pdf_urls) >= 3:
break
return pdf_urls
================================================
FILE: researchclaw/web/crawler.py
================================================
"""Web page → Markdown extraction powered by Crawl4AI.
Crawl4AI is the primary extraction engine (installed as a dependency).
A lightweight urllib fallback exists for environments where Crawl4AI's
browser dependency is not set up.
Usage::
crawler = WebCrawler()
result = await crawler.crawl("https://arxiv.org/abs/2301.00001")
print(result.markdown)
"""
from __future__ import annotations
import asyncio
import logging
import re
import time
from dataclasses import dataclass, field
from typing import Any
from urllib.request import Request, urlopen
from researchclaw.web._ssrf import check_url_ssrf
logger = logging.getLogger(__name__)
@dataclass
class CrawlResult:
    """Result of crawling a single URL.

    Produced by :class:`WebCrawler`. ``success`` means a backend completed
    without error; :attr:`has_content` additionally checks that a usable
    amount of Markdown was extracted.
    """
    # The URL that was fetched (also set on failed results).
    url: str
    # Extracted page content as Markdown (may be truncated by the crawler).
    markdown: str = ""
    # Page title, when a backend could determine one.
    title: str = ""
    # True when a backend completed without raising or reporting an error.
    success: bool = False
    # Error description when ``success`` is False.
    error: str = ""
    # Backend-provided extra data (e.g. Crawl4AI result metadata).
    metadata: dict[str, Any] = field(default_factory=dict)
    # Wall-clock seconds spent crawling this URL.
    elapsed_seconds: float = 0.0
    @property
    def has_content(self) -> bool:
        """True when the stripped Markdown is longer than 50 characters."""
        return bool(self.markdown and len(self.markdown.strip()) > 50)
class WebCrawler:
    """Web page → Markdown crawler powered by Crawl4AI.

    Crawl4AI (headless-browser extraction) is the primary backend; a
    lightweight urllib + regex fallback is used when Crawl4AI is not
    installed or fails for a URL. Every entry point first validates the
    URL against the SSRF policy and returns a failed :class:`CrawlResult`
    instead of raising.

    Parameters
    ----------
    timeout:
        Request timeout in seconds (urllib fallback).
    max_content_length:
        Maximum content length in characters (truncate beyond this).
    user_agent:
        User-Agent header sent by the urllib fallback.
    """
    def __init__(
        self,
        *,
        timeout: int = 30,
        max_content_length: int = 50_000,
        user_agent: str = "ResearchClaw/0.5 (Academic Research Bot)",
    ) -> None:
        self.timeout = timeout
        self.max_content_length = max_content_length
        self.user_agent = user_agent
    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def crawl(self, url: str) -> CrawlResult:
        """Crawl a URL and return Markdown content (async).

        Tries Crawl4AI first, then the urllib fallback; when both fail, a
        CrawlResult with ``success=False`` and the last error is returned
        rather than raising.
        """
        err = check_url_ssrf(url)
        if err:
            return CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0)
        t0 = time.monotonic()
        try:
            return await self._crawl_with_crawl4ai(url, t0)
        except Exception as exc:  # noqa: BLE001
            logger.debug("Crawl4AI failed for %s (%s), trying urllib fallback", url, exc)
            try:
                return self._crawl_with_urllib(url, t0)
            except Exception as exc2:  # noqa: BLE001
                elapsed = time.monotonic() - t0
                logger.warning("All crawl backends failed for %s: %s", url, exc2)
                return CrawlResult(url=url, success=False, error=str(exc2), elapsed_seconds=elapsed)
    def crawl_sync(self, url: str) -> CrawlResult:
        """Synchronous crawl — tries Crawl4AI via asyncio.run, falls back to urllib.

        When called from inside a running event loop, ``asyncio.run`` raises
        and the urllib fallback is used instead.
        """
        err = check_url_ssrf(url)
        if err:
            return CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0)
        t0 = time.monotonic()
        try:
            return asyncio.run(self._crawl_with_crawl4ai(url, t0))
        except Exception:  # noqa: BLE001
            try:
                return self._crawl_with_urllib(url, t0)
            except Exception as exc:  # noqa: BLE001
                elapsed = time.monotonic() - t0
                return CrawlResult(url=url, success=False, error=str(exc), elapsed_seconds=elapsed)
    async def crawl_many(self, urls: list[str]) -> list[CrawlResult]:
        """Crawl multiple URLs, reusing a single Crawl4AI browser session.

        Falls back to per-URL urllib fetches when Crawl4AI is not
        importable. One CrawlResult is appended per input URL, including
        failures and URLs rejected by the SSRF check.
        """
        results = []
        try:
            from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
            browser_config = BrowserConfig(headless=True)
            run_config = CrawlerRunConfig(
                word_count_threshold=10,
                excluded_tags=["nav", "footer", "header", "sidebar"],
                remove_overlay_elements=True,
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                for url in urls:
                    err = check_url_ssrf(url)
                    if err:
                        results.append(CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0))
                        continue
                    t0 = time.monotonic()
                    try:
                        raw = await crawler.arun(url=url, config=run_config)
                        elapsed = time.monotonic() - t0
                        if raw.success:
                            md = self._extract_markdown(raw)
                            results.append(CrawlResult(
                                url=url, markdown=md,
                                title=getattr(raw, "title", "") or "",
                                success=True, elapsed_seconds=elapsed,
                                metadata=raw.metadata if hasattr(raw, "metadata") and raw.metadata else {},
                            ))
                        else:
                            results.append(CrawlResult(
                                url=url, success=False,
                                error=getattr(raw, "error_message", "crawl failed"),
                                elapsed_seconds=elapsed,
                            ))
                    except Exception as exc:  # noqa: BLE001
                        elapsed = time.monotonic() - t0
                        results.append(CrawlResult(url=url, success=False, error=str(exc), elapsed_seconds=elapsed))
        except ImportError:
            # Crawl4AI browser not set up — use urllib for each URL.
            for url in urls:
                err = check_url_ssrf(url)
                if err:
                    results.append(CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0))
                    continue
                t0 = time.monotonic()
                try:
                    results.append(self._crawl_with_urllib(url, t0))
                except Exception as exc:  # noqa: BLE001
                    elapsed = time.monotonic() - t0
                    results.append(CrawlResult(url=url, success=False, error=str(exc), elapsed_seconds=elapsed))
        return results
    # ------------------------------------------------------------------
    # Crawl4AI backend (primary)
    # ------------------------------------------------------------------
    async def _crawl_with_crawl4ai(self, url: str, t0: float) -> CrawlResult:
        """Use Crawl4AI for high-quality extraction.

        Raises ImportError when crawl4ai is not installed — callers treat
        any exception as a signal to fall back to urllib.
        """
        from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
        browser_config = BrowserConfig(headless=True)
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            excluded_tags=["nav", "footer", "header", "sidebar"],
            remove_overlay_elements=True,
        )
        async with AsyncWebCrawler(config=browser_config) as crawler:
            raw = await crawler.arun(url=url, config=run_config)
            elapsed = time.monotonic() - t0
            if raw.success:
                md = self._extract_markdown(raw)
                return CrawlResult(
                    url=url, markdown=md,
                    title=getattr(raw, "title", "") or "",
                    success=True, elapsed_seconds=elapsed,
                    metadata=raw.metadata if hasattr(raw, "metadata") and raw.metadata else {},
                )
            return CrawlResult(
                url=url, success=False,
                error=getattr(raw, "error_message", "Unknown crawl4ai error"),
                elapsed_seconds=elapsed,
            )
    def _extract_markdown(self, raw: Any) -> str:
        """Extract markdown text from a Crawl4AI result object.

        Prefers ``markdown_v2.raw_markdown`` (Crawl4AI v0.8+) and falls
        back to the plain ``markdown`` attribute; the output is truncated
        to ``max_content_length`` characters.
        """
        md = ""
        if hasattr(raw, "markdown_v2") and raw.markdown_v2:
            md = getattr(raw.markdown_v2, "raw_markdown", "") or ""
        if not md and hasattr(raw, "markdown"):
            md = raw.markdown or ""
        if len(md) > self.max_content_length:
            md = md[: self.max_content_length] + "\n\n[... truncated]"
        return md
    # ------------------------------------------------------------------
    # urllib fallback (lightweight, no browser needed)
    # ------------------------------------------------------------------
    def _crawl_with_urllib(self, url: str, t0: float) -> CrawlResult:
        """Lightweight fallback: fetch HTML with urllib and strip tags.

        FIX: the ``<title>`` pattern had lost its tag tokens and could
        never match; restored. The response is now closed via a context
        manager instead of being left for the garbage collector.
        """
        req = Request(url, headers={"User-Agent": self.user_agent})
        with urlopen(req, timeout=self.timeout) as resp:  # noqa: S310
            content_type = resp.headers.get("Content-Type", "")
            raw = resp.read()
        # Honor the declared charset when present; replace undecodable bytes.
        encoding = "utf-8"
        if "charset=" in content_type:
            encoding = content_type.split("charset=")[-1].split(";")[0].strip()
        html = raw.decode(encoding, errors="replace")
        title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.DOTALL | re.IGNORECASE)
        title = title_match.group(1).strip() if title_match else ""
        markdown = self._html_to_markdown(html)
        if len(markdown) > self.max_content_length:
            markdown = markdown[: self.max_content_length] + "\n\n[... truncated]"
        elapsed = time.monotonic() - t0
        return CrawlResult(
            url=url, markdown=markdown, title=title,
            success=bool(markdown.strip()), elapsed_seconds=elapsed,
        )
    @staticmethod
    def _html_to_markdown(html: str) -> str:
        """Best-effort HTML → Markdown conversion via regex.

        FIX: every tag pattern here had been corrupted (the opening ``<tag``
        and closing ``</tag>`` tokens were lost, leaving broken string
        literals); the intended patterns are reconstructed from the
        replacement templates (#, ##, ###, -, [text](url)).
        """
        # Drop non-content elements entirely.
        text = re.sub(r"<(script|style|noscript)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
        # Headings → Markdown heading markers.
        text = re.sub(r"<h1[^>]*>(.*?)</h1>", r"\n# \1\n", text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<h2[^>]*>(.*?)</h2>", r"\n## \1\n", text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<h3[^>]*>(.*?)</h3>", r"\n### \1\n", text, flags=re.DOTALL | re.IGNORECASE)
        # List items → bullet lines.
        text = re.sub(r"<li[^>]*>(.*?)</li>", r"\n- \1", text, flags=re.DOTALL | re.IGNORECASE)
        # Paragraphs and line breaks → newlines.
        text = re.sub(r"<p[^>]*>(.*?)</p>", r"\n\1\n", text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
        # Anchors → Markdown links.
        text = re.sub(r"<a[^>]*href=[\"']([^\"']*)[\"'][^>]*>(.*?)</a>", r"[\2](\1)", text, flags=re.DOTALL | re.IGNORECASE)
        # Strip any remaining tags, then decode HTML entities.
        text = re.sub(r"<[^>]+>", "", text)
        import html as _html
        text = _html.unescape(text)
        # Collapse excessive blank lines and runs of spaces.
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text)
        return text.strip()
================================================
FILE: researchclaw/web/pdf_extractor.py
================================================
"""PDF full-text extraction powered by PyMuPDF (fitz).
PyMuPDF is installed as a dependency and provides fast, high-quality
PDF text extraction with metadata, section detection, and table support.
Usage::
extractor = PDFExtractor()
result = extractor.extract("/path/to/paper.pdf")
print(result.text[:1000])
"""
from __future__ import annotations
import logging
import re
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from urllib.request import Request, urlopen
from researchclaw.web._ssrf import check_url_ssrf
try:
import fitz # PyMuPDF
HAS_FITZ = True
except ImportError:
fitz = None # type: ignore[assignment]
HAS_FITZ = False
logger = logging.getLogger(__name__)
@dataclass
class PDFContent:
    """Extracted content from a PDF file.

    Produced by :class:`PDFExtractor`; ``success`` reflects whether
    extraction completed, while :attr:`has_content` checks that a usable
    amount of text was actually recovered.
    """
    # Source path (or the URL when extracted via ``extract_from_url``).
    path: str
    # Full extracted text, pages joined by newlines.
    text: str = ""
    # Title from the PDF metadata (may be empty).
    title: str = ""
    # Author names parsed from the comma-separated metadata field.
    authors: list[str] = field(default_factory=list)
    # Abstract detected heuristically from the text.
    abstract: str = ""
    # Detected sections as {"heading": ..., "text": ...} dicts.
    sections: list[dict[str, str]] = field(default_factory=list)
    # Total page count of the document (not just the pages read).
    page_count: int = 0
    # True when extraction completed without error.
    success: bool = False
    # Error description when ``success`` is False.
    error: str = ""
    # Extraction backend identifier.
    backend: str = "pymupdf"
    # Raw PDF metadata dict as reported by the backend.
    metadata: dict[str, Any] = field(default_factory=dict)
    @property
    def has_content(self) -> bool:
        """True when the stripped text is longer than 100 characters."""
        return bool(self.text and len(self.text.strip()) > 100)
class PDFExtractor:
    """PDF text extraction using PyMuPDF.

    All methods return a :class:`PDFContent` (with ``error`` set on
    failure) instead of raising.

    Parameters
    ----------
    max_pages:
        Maximum pages to extract (0 = all).
    extract_sections:
        Whether to attempt section boundary detection.
    """
    def __init__(
        self,
        *,
        max_pages: int = 0,
        extract_sections: bool = True,
    ) -> None:
        self.max_pages = max_pages
        self.extract_sections = extract_sections
    @property
    def backend(self) -> str:
        """Name of the extraction backend in use."""
        return "pymupdf"
    def extract(self, path: str | Path) -> PDFContent:
        """Extract text from a local PDF file using PyMuPDF.

        Reads at most ``max_pages`` pages (all when 0), pulls title/author
        from the PDF metadata, and heuristically detects the abstract and
        section boundaries. Returns a failed PDFContent when PyMuPDF is
        missing, the file is unreadable, or parsing raises.
        """
        if not HAS_FITZ:
            return PDFContent(
                path=str(path),
                error="PyMuPDF not installed. Install: pip install 'researchclaw[pdf]'",
            )
        path = Path(path)
        # Treat permission errors during the existence check as "not found".
        try:
            _exists = path.exists()
        except (PermissionError, OSError):
            _exists = False
        if not _exists:
            return PDFContent(path=str(path), error=f"File not found: {path}")
        try:
            with fitz.open(str(path)) as doc:
                pages_to_read = doc.page_count
                if self.max_pages > 0:
                    pages_to_read = min(pages_to_read, self.max_pages)
                all_text = []
                for i in range(pages_to_read):
                    page = doc[i]
                    all_text.append(page.get_text())
                full_text = "\n".join(all_text)
                meta = doc.metadata or {}
                title = meta.get("title", "")
                # Metadata author is a single comma-separated string.
                author = meta.get("author", "")
                authors = [a.strip() for a in author.split(",")] if author else []
                abstract = self._extract_abstract(full_text)
                sections = self._detect_sections(full_text) if self.extract_sections else []
                page_count = doc.page_count
            return PDFContent(
                path=str(path),
                text=full_text,
                title=title,
                authors=authors,
                abstract=abstract,
                sections=sections,
                page_count=page_count,
                success=True,
                metadata=meta,
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning("PDF extraction failed for %s: %s", path, exc)
            return PDFContent(path=str(path), error=str(exc))
    def extract_from_url(self, url: str) -> PDFContent:
        """Download a PDF from URL and extract text.

        The URL is SSRF-checked first; the download goes to a temporary
        file that is always removed afterwards. ``result.path`` is set to
        the URL so callers can attribute the content.

        FIX: the HTTP response is now closed via a context manager instead
        of being left open until garbage collection.
        """
        err = check_url_ssrf(url)
        if err:
            return PDFContent(path=url, error=err)
        tmp_path = None
        try:
            req = Request(url, headers={
                "User-Agent": "ResearchClaw/0.5 (Academic Research Bot)"
            })
            with urlopen(req, timeout=30) as resp:  # noqa: S310
                data = resp.read()
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(data)
                tmp_path = f.name
            result = self.extract(tmp_path)
            result.path = url
            return result
        except Exception as exc:  # noqa: BLE001
            logger.warning("PDF download failed for %s: %s", url, exc)
            return PDFContent(path=url, error=str(exc))
        finally:
            if tmp_path:
                Path(tmp_path).unlink(missing_ok=True)
    # ------------------------------------------------------------------
    # Section detection
    # ------------------------------------------------------------------
    @staticmethod
    def _extract_abstract(text: str) -> str:
        """Extract the abstract from paper text.

        First looks for an "Abstract" heading followed by an
        Introduction-style heading; failing that, takes the text between
        "Abstract" and the next blank line. Returns "" when neither
        pattern matches.
        """
        match = re.search(
            r"(?:^|\n)\s*Abstract\s*\n(.*?)(?=\n\s*(?:\d+\.?\s+)?(?:Introduction|1\s))",
            text, re.DOTALL | re.IGNORECASE,
        )
        if match:
            return match.group(1).strip()
        match = re.search(
            r"(?:^|\n)\s*Abstract[:\s]*\n?(.*?)(?:\n\n|\n\s*\n)",
            text, re.DOTALL | re.IGNORECASE,
        )
        if match:
            return match.group(1).strip()
        return ""
    @staticmethod
    def _detect_sections(text: str) -> list[dict[str, str]]:
        """Detect numbered section boundaries in paper text.

        Matches headings of the form "1 Title" / "2. Title" on their own
        line; each section's body runs to the next heading (or end of
        text) and is capped at 5000 characters.
        """
        sections: list[dict[str, str]] = []
        pattern = re.compile(r"(?:^|\n)\s*(\d+\.?\s+[A-Z][^\n]{2,50})\s*\n", re.MULTILINE)
        matches = list(pattern.finditer(text))
        for i, match in enumerate(matches):
            heading = match.group(1).strip()
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            body = text[start:end].strip()
            sections.append({"heading": heading, "text": body[:5000]})
        return sections
================================================
FILE: researchclaw/web/scholar.py
================================================
"""Google Scholar search powered by the ``scholarly`` library.
scholarly is installed as a dependency and provides direct access to
Google Scholar search, citation graph traversal, and author lookup.
Usage::
client = GoogleScholarClient()
papers = client.search("attention is all you need", limit=5)
citing = client.get_citations(papers[0].scholar_id, limit=10)
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any
try:
from scholarly import scholarly, ProxyGenerator
HAS_SCHOLARLY = True
except ImportError:
scholarly = None # type: ignore[assignment]
ProxyGenerator = None # type: ignore[assignment,misc]
HAS_SCHOLARLY = False
logger = logging.getLogger(__name__)
@dataclass
class ScholarPaper:
    """A paper result from Google Scholar."""
    title: str
    # Author names; a single "A and B" string from scholarly is split
    # into a list by the client before construction.
    authors: list[str] = field(default_factory=list)
    # Publication year (0 when missing or unparseable).
    year: int = 0
    abstract: str = ""
    citation_count: int = 0
    url: str = ""
    # Google Scholar identifier (author_pub_id, or the first cites_id).
    scholar_id: str = ""
    venue: str = ""
    # Origin tag so mixed result lists can be attributed.
    source: str = "google_scholar"
    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly plain dict."""
        return {
            "title": self.title,
            "authors": self.authors,
            "year": self.year,
            "abstract": self.abstract,
            "citation_count": self.citation_count,
            "url": self.url,
            "scholar_id": self.scholar_id,
            "venue": self.venue,
            "source": self.source,
        }
    def to_literature_paper(self) -> Any:
        """Convert to researchclaw.literature.models.Paper.

        When ``scholar_id`` is empty, a deterministic ``gs-`` id is derived
        from the SHA-256 of the title so the paper still has a stable key.
        """
        from researchclaw.literature.models import Author, Paper
        authors_tuple = tuple(Author(name=a) for a in self.authors)
        return Paper(
            paper_id=self.scholar_id or f"gs-{hashlib.sha256(self.title.encode()).hexdigest()[:8]}",
            title=self.title,
            authors=authors_tuple,
            year=self.year,
            abstract=self.abstract,
            venue=self.venue,
            citation_count=self.citation_count,
            url=self.url,
            source="google_scholar",
        )
class GoogleScholarClient:
    """Google Scholar search client using the ``scholarly`` library.

    All query methods throttle themselves via :meth:`_rate_limit` and
    swallow errors (logging a warning and returning an empty/partial
    result) rather than raising.

    Parameters
    ----------
    inter_request_delay:
        Seconds between requests to avoid rate limiting.
    use_proxy:
        Whether to set up a free proxy to reduce blocking risk.

    Raises
    ------
    ImportError
        If the ``scholarly`` package is not installed.
    """
    def __init__(
        self,
        *,
        inter_request_delay: float = 2.0,
        use_proxy: bool = False,
    ) -> None:
        if not HAS_SCHOLARLY:
            raise ImportError(
                "scholarly is required for Google Scholar search. "
                "Install: pip install 'researchclaw[web]'"
            )
        self.delay = inter_request_delay
        # Monotonic timestamp of the last outgoing request (see _rate_limit).
        self._last_request_time: float = 0.0
        if use_proxy:
            # Best effort: free proxies are flaky, so any setup failure is
            # logged and the client proceeds without a proxy.
            try:
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                logger.info("Google Scholar: proxy enabled")
            except Exception as exc:  # noqa: BLE001
                logger.warning("Failed to set up proxy: %s", exc)
    @property
    def available(self) -> bool:
        """Always True — scholarly is installed as a dependency."""
        return True
    def search(self, query: str, *, limit: int = 10) -> list[ScholarPaper]:
        """Search Google Scholar for papers matching query.

        Returns at most ``limit`` papers. An exception mid-iteration is
        logged and the papers parsed so far are returned.
        """
        self._rate_limit()
        results: list[ScholarPaper] = []
        try:
            search_gen = scholarly.search_pubs(query)
            for i, pub in enumerate(search_gen):
                if i >= limit:
                    break
                results.append(self._parse_pub(pub))
                # Throttle between fetches, but not after the final one.
                if i < limit - 1:
                    self._rate_limit()
            logger.info("Google Scholar: found %d papers for %r", len(results), query)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Google Scholar search failed: %s", exc)
        return results
    def get_citations(self, scholar_id: str, *, limit: int = 20) -> list[ScholarPaper]:
        """Get papers that cite the given paper (citation graph traversal).

        Looks the paper up first, then iterates its citing publications;
        returns at most ``limit`` entries, or whatever was collected
        before an error.
        """
        self._rate_limit()
        results: list[ScholarPaper] = []
        try:
            pub = scholarly.search_single_pub(scholar_id)
            if pub:
                citations = scholarly.citedby(pub)
                for i, cit in enumerate(citations):
                    if i >= limit:
                        break
                    results.append(self._parse_pub(cit))
                    # Throttle between fetches, but not after the final one.
                    if i < limit - 1:
                        self._rate_limit()
            logger.info("Google Scholar: found %d citations for %s", len(results), scholar_id)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Citation retrieval failed for %s: %s", scholar_id, exc)
        return results
    def search_author(self, name: str) -> list[dict[str, Any]]:
        """Search for an author on Google Scholar.

        Returns up to five author dicts with name, affiliation, id,
        citation count and interests; an empty list on failure.
        """
        self._rate_limit()
        try:
            results = []
            for author in scholarly.search_author(name):
                results.append({
                    "name": author.get("name", ""),
                    "affiliation": author.get("affiliation", ""),
                    "scholar_id": author.get("scholar_id", ""),
                    "citedby": author.get("citedby", 0),
                    "interests": author.get("interests", []),
                })
                if len(results) >= 5:
                    break
            return results
        except Exception as exc:  # noqa: BLE001
            logger.warning("Author search failed for %s: %s", name, exc)
            return []
    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------
    def _rate_limit(self) -> None:
        """Sleep just long enough that requests are ``delay`` seconds apart."""
        now = time.monotonic()
        elapsed = now - self._last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self._last_request_time = time.monotonic()
    @staticmethod
    def _parse_pub(pub: Any) -> ScholarPaper:
        """Parse a scholarly publication object into ScholarPaper.

        Accepts both plain dicts and objects exposing a ``bib`` attribute;
        missing fields fall back to empty/zero values.
        """
        bib = pub.get("bib", {}) if isinstance(pub, dict) else getattr(pub, "bib", {})
        info = pub if isinstance(pub, dict) else pub.__dict__ if hasattr(pub, "__dict__") else {}
        authors = bib.get("author", [])
        if isinstance(authors, str):
            # "A and B and C" → ["A", "B", "C"]
            authors = [a.strip() for a in authors.split(" and ")]
        year = 0
        year_raw = bib.get("pub_year", bib.get("year", 0))
        try:
            year = int(year_raw)
        except (ValueError, TypeError):
            pass  # leave year as 0 when unparseable
        cites_id = info.get("cites_id", [])
        # Prefer the stable author_pub_id; fall back to the first cites_id.
        scholar_id = info.get("author_pub_id", "") or (
            cites_id[0] if isinstance(cites_id, list) and cites_id else ""
        )
        return ScholarPaper(
            title=bib.get("title", ""),
            authors=authors,
            year=year,
            abstract=bib.get("abstract", ""),
            citation_count=info.get("num_citations", 0),
            url=info.get("pub_url", info.get("eprint_url", "")),
            scholar_id=scholar_id,
            venue=bib.get("venue", bib.get("journal", "")),
        )
================================================
FILE: researchclaw/web/search.py
================================================
"""Web search powered by Tavily AI Search API.
Tavily is the primary search engine (installed as a dependency).
A DuckDuckGo HTML scrape fallback exists for when no API key is set.
Usage::
client = WebSearchClient(api_key="tvly-...")
results = client.search("knowledge distillation survey 2024")
"""
from __future__ import annotations
import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any
from urllib.request import Request, urlopen
from urllib.parse import quote_plus
logger = logging.getLogger(__name__)
@dataclass
class SearchResult:
    """A single web search result."""
    title: str
    url: str
    # Short excerpt; the Tavily backend fills this with the first 500
    # characters of ``content``.
    snippet: str = ""
    # Full content text when the backend provides it (Tavily does;
    # the DuckDuckGo fallback leaves it empty).
    content: str = ""
    # Backend relevance score (0.0 when not provided).
    score: float = 0.0
    source: str = ""  # "tavily" | "duckduckgo"
    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly plain dict."""
        return {
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
            "content": self.content,
            "score": self.score,
            "source": self.source,
        }
@dataclass
class WebSearchResponse:
    """Response from a web search query."""
    # The query string that produced this response.
    query: str
    # Result list (possibly deduplicated across queries by search_multi).
    results: list[SearchResult] = field(default_factory=list)
    answer: str = ""  # Tavily can provide a direct AI answer
    # Wall-clock seconds spent on this query.
    elapsed_seconds: float = 0.0
    source: str = ""  # "tavily" | "duckduckgo"
    @property
    def has_results(self) -> bool:
        """True when at least one result was returned."""
        return len(self.results) > 0
class WebSearchClient:
    """General-purpose web search client.

    Uses Tavily (installed) as primary engine. Falls back to DuckDuckGo
    HTML scraping when no Tavily API key is available or the Tavily call
    fails. ``search`` never raises — failures degrade to an empty
    response.

    Parameters
    ----------
    api_key:
        Tavily API key. Falls back to ``TAVILY_API_KEY`` env var.
    max_results:
        Default number of results per query.
    search_depth:
        Tavily search depth: "basic" or "advanced".
    include_answer:
        Whether to request Tavily's AI-generated answer.
    """
    def __init__(
        self,
        *,
        api_key: str = "",
        max_results: int = 10,
        search_depth: str = "advanced",
        include_answer: bool = True,
    ) -> None:
        self.api_key = api_key or os.environ.get("TAVILY_API_KEY", "")
        self.max_results = max_results
        self.search_depth = search_depth
        self.include_answer = include_answer
    def search(
        self,
        query: str,
        *,
        max_results: int | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
    ) -> WebSearchResponse:
        """Search the web for a query.

        The domain filters apply only to the Tavily backend; the
        DuckDuckGo fallback ignores them.
        """
        limit = max_results or self.max_results
        t0 = time.monotonic()
        # Tavily is the primary engine
        if self.api_key:
            try:
                return self._search_tavily(query, limit, include_domains, exclude_domains, t0)
            except Exception as exc:  # noqa: BLE001
                logger.warning("Tavily search failed, falling back to DuckDuckGo: %s", exc)
        return self._search_duckduckgo(query, limit, t0)
    def search_multi(
        self,
        queries: list[str],
        *,
        max_results: int | None = None,
        inter_query_delay: float = 1.0,
    ) -> list[WebSearchResponse]:
        """Run multiple search queries with cross-query deduplication.

        A URL that already appeared in an earlier query's results is
        dropped from later responses; ``inter_query_delay`` seconds pass
        between consecutive queries.
        """
        responses = []
        seen_urls: set[str] = set()
        for i, query in enumerate(queries):
            if i > 0:
                time.sleep(inter_query_delay)
            resp = self.search(query, max_results=max_results)
            unique_results = [r for r in resp.results if r.url not in seen_urls]
            seen_urls.update(r.url for r in unique_results)
            resp.results = unique_results
            responses.append(resp)
        return responses
    # ------------------------------------------------------------------
    # Tavily backend (primary — uses installed tavily-python SDK)
    # ------------------------------------------------------------------
    def _search_tavily(
        self,
        query: str,
        limit: int,
        include_domains: list[str] | None,
        exclude_domains: list[str] | None,
        t0: float,
    ) -> WebSearchResponse:
        """Search using Tavily API (installed SDK).

        Raises on any SDK/network error — the caller handles the fallback.
        """
        from tavily import TavilyClient
        client = TavilyClient(api_key=self.api_key)
        kwargs: dict[str, Any] = {
            "query": query,
            "max_results": limit,
            "search_depth": self.search_depth,
            "include_answer": self.include_answer,
        }
        if include_domains:
            kwargs["include_domains"] = include_domains
        if exclude_domains:
            kwargs["exclude_domains"] = exclude_domains
        response = client.search(**kwargs)
        elapsed = time.monotonic() - t0
        results = []
        for item in response.get("results", []):
            results.append(SearchResult(
                title=item.get("title", ""),
                url=item.get("url", ""),
                # Snippet is the first 500 chars of the full content.
                snippet=item.get("content", "")[:500],
                content=item.get("content", ""),
                score=item.get("score", 0.0),
                source="tavily",
            ))
        return WebSearchResponse(
            query=query,
            results=results,
            answer=response.get("answer", ""),
            elapsed_seconds=elapsed,
            source="tavily",
        )
    # ------------------------------------------------------------------
    # DuckDuckGo fallback (no API key needed)
    # ------------------------------------------------------------------
    def _search_duckduckgo(
        self, query: str, limit: int, t0: float
    ) -> WebSearchResponse:
        """Fallback: scrape DuckDuckGo HTML search results.

        Returns an empty response (never raises) when the fetch fails.
        FIX: the HTTP response is now closed via a context manager.
        """
        encoded = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded}"
        req = Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
        })
        try:
            with urlopen(req, timeout=15) as resp:  # noqa: S310
                html = resp.read().decode("utf-8", errors="replace")
        except Exception as exc:  # noqa: BLE001
            elapsed = time.monotonic() - t0
            logger.warning("DuckDuckGo search failed: %s", exc)
            return WebSearchResponse(query=query, elapsed_seconds=elapsed, source="duckduckgo")
        results = self._parse_ddg_html(html, limit)
        elapsed = time.monotonic() - t0
        return WebSearchResponse(query=query, results=results, elapsed_seconds=elapsed, source="duckduckgo")
    @staticmethod
    def _parse_ddg_html(html: str, limit: int) -> list[SearchResult]:
        """Parse DuckDuckGo HTML results page.

        FIX: both anchor patterns had been corrupted (the opening ``<a``
        and closing ``</a>`` tokens were lost, so they could never match a
        real result link); the intended patterns are reconstructed here.
        """
        results = []
        # Result links: <a class="result__a" href="...">Title</a>
        link_pattern = re.compile(
            r'<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>(.*?)</a>', re.DOTALL,
        )
        # Snippets: <a class="result__snippet" ...>text</a>
        snippet_pattern = re.compile(
            r'<a[^>]*class="result__snippet"[^>]*>(.*?)</a>', re.DOTALL,
        )
        links = link_pattern.findall(html)
        snippets = snippet_pattern.findall(html)
        for i, (url, title_html) in enumerate(links[:limit]):
            # Strip any markup inside the title/snippet text.
            title = re.sub(r"<[^>]+>", "", title_html).strip()
            snippet = re.sub(r"<[^>]+>", "", snippets[i]).strip() if i < len(snippets) else ""
            if "duckduckgo.com" in url:
                # Extract actual URL from DDG redirect: //duckduckgo.com/l/?uddg=https%3A...
                from urllib.parse import urlparse as _urlparse, parse_qs as _parse_qs, unquote as _unquote
                _parsed_ddg = _urlparse(url)
                _uddg = _parse_qs(_parsed_ddg.query).get("uddg")
                if _uddg:
                    url = _unquote(_uddg[0])
                else:
                    # Redirect without a target — drop the entry.
                    continue
            results.append(SearchResult(title=title, url=url, snippet=snippet, source="duckduckgo"))
        return results
================================================
FILE: researchclaw/wizard/__init__.py
================================================
"""Setup wizard modules."""
================================================
FILE: researchclaw/wizard/quickstart.py
================================================
"""Quick-start interactive setup wizard."""
from __future__ import annotations
import sys
from typing import Any
from researchclaw.wizard.templates import TEMPLATES
class QuickStartWizard:
"""Interactive configuration generator."""
def run_interactive(self, template: str | None = None) -> dict[str, Any]:
"""CLI interactive wizard — returns a config dict."""
print("\n=== ResearchClaw Setup Wizard ===\n")
if template:
return self._apply_template(template)
config: dict[str, Any] = {}
# 1. Project name
name = self._ask("Project name", default="my-research")
config["project"] = {"name": name, "mode": "full-auto"}
# 2. Research topic
topic = self._ask("Research topic (describe in one sentence)")
if not topic:
print("Topic is required.")
return {}
config["research"] = {"topic": topic}
# 3. Research domain
domains_str = self._ask(
"Research domains (comma-separated: cv, nlp, rl, ml, ai4science)",
default="ml",
)
config["research"]["domains"] = [
d.strip() for d in domains_str.split(",") if d.strip()
]
# 4. Experiment mode
mode = self._choose(
"Experiment mode",
["simulated", "docker", "sandbox"],
default="docker",
)
config["experiment"] = {"mode": mode}
if mode == "docker":
gpu = self._ask_yn("Enable GPU?", default=True)
config["experiment"]["docker"] = {
"gpu_enabled": gpu,
"network_policy": "setup_only",
}
budget = self._ask("Time budget (seconds)", default="600")
config["experiment"]["time_budget_sec"] = int(budget)
# 5. LLM provider
print("\n--- LLM Configuration ---")
provider = self._choose(
"LLM provider",
["openai-compatible", "acp"],
default="openai-compatible",
)
config["llm"] = {"provider": provider}
if provider == "openai-compatible":
base_url = self._ask("API base URL", default="https://api.openai.com/v1")
api_key_env = self._ask("API key env var", default="OPENAI_API_KEY")
model = self._ask("Model name", default="gpt-4o")
config["llm"].update({
"base_url": base_url,
"api_key_env": api_key_env,
"primary_model": model,
})
# 6. Output format
conference = self._choose(
"Target conference format",
["neurips_2025", "iclr_2025", "icml_2025", "arxiv"],
default="neurips_2025",
)
config["export"] = {"target_conference": conference}
# 7. Runtime
config["runtime"] = {"timezone": "UTC"}
config["notifications"] = {"channel": "console"}
config["knowledge_base"] = {"backend": "markdown", "root": "knowledge"}
print("\n--- Configuration Summary ---")
self._print_summary(config)
confirm = self._ask_yn("\nSave this configuration?", default=True)
if not confirm:
print("Cancelled.")
return {}
return config
def run_web(self, steps: list[dict[str, Any]]) -> dict[str, Any]:
"""Process wizard steps from web interface."""
config: dict[str, Any] = {}
for step in steps:
key = step.get("key", "")
value = step.get("value", "")
if key == "project_name":
config.setdefault("project", {})["name"] = value
elif key == "topic":
config.setdefault("research", {})["topic"] = value
elif key == "mode":
config.setdefault("experiment", {})["mode"] = value
elif key == "model":
config.setdefault("llm", {})["primary_model"] = value
return config
def _apply_template(self, name: str) -> dict[str, Any]:
"""Apply a preset template."""
mapping = {
"quick": "quick-demo",
"standard": "standard-cv",
"advanced": "deep-nlp",
}
tpl_name = mapping.get(name, name)
tpl = TEMPLATES.get(tpl_name)
if not tpl:
print(f"Unknown template: {name}")
return {}
config = self._template_to_config(tpl)
print(f"Applied template: {tpl_name}")
print(f" Description: {tpl.get('description', '')}")
self._print_summary(config)
return config
def _template_to_config(self, tpl: dict[str, Any]) -> dict[str, Any]:
"""Convert a flat template to nested config dict."""
config: dict[str, Any] = {
"project": {"name": "wizard-project", "mode": "full-auto"},
"runtime": {"timezone": "UTC"},
"notifications": {"channel": "console"},
"knowledge_base": {"backend": "markdown", "root": "knowledge"},
"research": {"topic": "Generated by wizard"},
"llm": {"provider": "openai-compatible", "api_key_env": "OPENAI_API_KEY"},
}
for key, value in tpl.items():
if key == "description":
continue
parts = key.split(".")
d = config
for p in parts[:-1]:
d = d.setdefault(p, {})
d[parts[-1]] = value
return config
def _ask(self, prompt: str, default: str = "") -> str:
suffix = f" [{default}]" if default else ""
try:
answer = input(f" {prompt}{suffix}: ").strip()
except (EOFError, KeyboardInterrupt):
print()
return default
return answer or default
def _ask_yn(self, prompt: str, default: bool = True) -> bool:
suffix = " [Y/n]" if default else " [y/N]"
try:
answer = input(f" {prompt}{suffix}: ").strip().lower()
except (EOFError, KeyboardInterrupt):
print()
return default
if not answer:
return default
return answer in ("y", "yes", "1", "true")
def _choose(
self,
prompt: str,
options: list[str],
default: str = "",
) -> str:
print(f" {prompt}:")
for i, opt in enumerate(options, 1):
marker = " *" if opt == default else ""
print(f" {i}. {opt}{marker}")
try:
answer = input(f" Choice [default={default}]: ").strip()
except (EOFError, KeyboardInterrupt):
print()
return default
if not answer:
return default
try:
idx = int(answer) - 1
if 0 <= idx < len(options):
return options[idx]
except ValueError:
if answer in options:
return answer
return default
def _print_summary(self, config: dict[str, Any], indent: int = 2) -> None:
    """Pretty-print *config* as block-style YAML to stdout.

    Parameters
    ----------
    config:
        Nested configuration dict to display.
    indent:
        Indentation width forwarded to ``yaml.dump``.  Fix: this parameter
        was previously accepted but silently ignored; it is now honoured.
        The default of 2 matches PyYAML's own default, so existing callers
        see identical output.
    """
    import yaml
    print(yaml.dump(config, default_flow_style=False, allow_unicode=True, indent=indent))
================================================
FILE: researchclaw/wizard/templates.py
================================================
"""Preset research configuration templates."""
from __future__ import annotations
from typing import Any
# Registry of built-in wizard presets.  Keys are template names; values are
# flat mappings from dotted config paths (e.g. "experiment.mode") to values,
# which the wizard expands into the nested config layout.  The "description"
# entry is display metadata only and is not written into the config.
TEMPLATES: dict[str, dict[str, Any]] = {
    "quick-demo": {
        "description": "5-minute quick demo (simulated mode, no GPU needed)",
        "experiment.mode": "simulated",
        "experiment.time_budget_sec": 60,
        "experiment.max_iterations": 3,
    },
    "standard-cv": {
        "description": "Standard Computer Vision paper (Docker + CIFAR-10)",
        "research.domains": ["computer-vision"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 600,
        "experiment.docker.gpu_enabled": True,
        "experiment.docker.network_policy": "setup_only",
    },
    "deep-nlp": {
        "description": "Deep NLP research (Docker + GPU + transformers)",
        "research.domains": ["nlp", "transformers"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 1200,
        "experiment.docker.gpu_enabled": True,
        "experiment.docker.memory_limit_mb": 16384,
    },
    "rl-research": {
        "description": "Reinforcement Learning research (Docker + custom env)",
        "research.domains": ["reinforcement-learning"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 900,
        "experiment.docker.gpu_enabled": True,
    },
    "ai4science": {
        "description": "AI for Science (large compute budget)",
        "research.domains": ["ai4science"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 1800,
        "experiment.docker.gpu_enabled": True,
        "experiment.docker.memory_limit_mb": 32768,
    },
}
def get_template(name: str) -> dict[str, Any] | None:
    """Return the preset template registered under *name*, or ``None``."""
    try:
        return TEMPLATES[name]
    except KeyError:
        return None
def list_templates() -> list[dict[str, str]]:
    """Enumerate all templates as ``{"name", "description"}`` records."""
    catalogue: list[dict[str, str]] = []
    for name, spec in TEMPLATES.items():
        catalogue.append({"name": name, "description": spec.get("description", "")})
    return catalogue
================================================
FILE: researchclaw/wizard/validator.py
================================================
"""Environment detection and recommendation for the setup wizard."""
from __future__ import annotations
import os
import shutil
from dataclasses import dataclass, field
from typing import Any
@dataclass
class EnvironmentReport:
    """Report of detected environment capabilities.

    Populated by detect_environment(); all fields default to the
    "not detected" state so a bare instance is a valid empty report.
    """

    has_gpu: bool = False            # CUDA device reachable via torch
    gpu_name: str = ""               # name of device 0, if any
    gpu_vram_gb: float = 0.0         # total VRAM of device 0, in GiB
    has_docker: bool = False         # `docker` executable found on PATH
    docker_version: str = ""         # output of `docker --version`
    has_python: bool = True          # trivially true when this code runs
    python_version: str = ""         # "major.minor.micro"
    has_latex: bool = False          # `pdflatex` executable found on PATH
    available_memory_gb: float = 0.0 # free RAM in GiB (via psutil)
    recommendations: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the report; memory is rounded to one decimal place."""
        snapshot: dict[str, Any] = dict(
            has_gpu=self.has_gpu,
            gpu_name=self.gpu_name,
            gpu_vram_gb=self.gpu_vram_gb,
            has_docker=self.has_docker,
            docker_version=self.docker_version,
            has_python=self.has_python,
            python_version=self.python_version,
            has_latex=self.has_latex,
            available_memory_gb=round(self.available_memory_gb, 1),
        )
        snapshot["recommendations"] = self.recommendations
        return snapshot
def detect_environment() -> EnvironmentReport:
    """Detect local environment and generate recommendations.

    Best-effort probing: every optional check is wrapped so that a missing
    or broken dependency (docker, torch, psutil) degrades to "not detected"
    instead of raising into the caller.
    """
    import sys
    import subprocess
    report = EnvironmentReport()
    report.python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
    # Docker: presence via PATH lookup; version string via the CLI (5 s cap).
    if shutil.which("docker"):
        report.has_docker = True
        try:
            result = subprocess.run(
                ["docker", "--version"],
                capture_output=True, text=True, timeout=5
            )
            report.docker_version = result.stdout.strip()
        except Exception:
            pass  # version string is cosmetic; presence is already recorded
    # GPU: torch.cuda can raise RuntimeError (not just ImportError) on a
    # broken CUDA install, so catch broadly — probing must never crash.
    try:
        import torch
        if torch.cuda.is_available():
            report.has_gpu = True
            report.gpu_name = torch.cuda.get_device_name(0)
            report.gpu_vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    except Exception:
        pass
    # LaTeX toolchain (needed for PDF paper export).
    report.has_latex = shutil.which("pdflatex") is not None
    # Available RAM via psutil, if installed; broad catch for the same reason.
    try:
        import psutil
        report.available_memory_gb = psutil.virtual_memory().available / (1024**3)
    except Exception:
        pass
    # Recommendations derived from the findings above.
    if not report.has_docker:
        report.recommendations.append(
            "Install Docker for experiment isolation (recommended)"
        )
    if not report.has_gpu:
        report.recommendations.append(
            "No GPU detected — use 'simulated' mode or remote GPU server"
        )
    if not report.has_latex:
        report.recommendations.append(
            "Install LaTeX (texlive) for PDF paper export"
        )
    if report.has_gpu and report.has_docker:
        report.recommendations.append(
            "Environment ready for full Docker GPU experiments"
        )
    return report
================================================
FILE: researchclaw/writing_guide.py
================================================
"""Conference-grade writing knowledge base.
Structured tips from NeurIPS/ICML/ICLR best practices, reviewer feedback
analysis, and accepted paper patterns. Can be loaded and injected into
prompts at runtime, allowing updates without modifying prompt YAML.
"""
from __future__ import annotations
# Category -> list of tips.  Category names are looked up by
# format_writing_tips(); most map to paper sections, while
# "common_rejections" and "rebuttal" cover the review process itself.
CONFERENCE_WRITING_TIPS: dict[str, list[str]] = {
    "title": [
        "Signal novelty — title should hint at what is new",
        "Be specific and concrete, under 15 words",
        "No abbreviations unless universally known",
        "Pattern: '[Finding]: [Evidence]' or '[Method]: [What it does]'",
        "Memeability test: would a reader enjoy telling a colleague about this?",
    ],
    "abstract": [
        # Adjacent literals below are implicitly concatenated into one tip.
        "5-sentence structure: (1) problem, (2) prior approaches + limitations, "
        "(3) your approach + novelty, (4) key results with numbers, (5) implication",
        "150-250 words for ML conferences",
        "Include at least 2 specific quantitative results",
    ],
    "figure_1": [
        "Most important figure in the paper — many readers look at Figure 1 first",
        "Should convey the key idea or main result at a glance",
        "Invest significant time in this figure",
    ],
    "introduction": [
        "State contributions clearly as bullet points",
        "Many reviewers stop reading carefully after the intro",
        "Include paper organization paragraph at the end",
    ],
    "experiments": [
        "Strong baselines: tune baselines with the same effort as your method",
        "Ablations: remove one component at a time and measure the effect",
        "Reproducibility: include hyperparameters, seeds, hardware specs",
        "Statistical rigor: report variance, run multiple seeds",
    ],
    "common_rejections": [
        "Weak baselines (79% of rejected papers)",
        "Missing ablations",
        "Overclaiming beyond evidence",
        "Poor reproducibility details",
        "Ignoring limitations",
    ],
    "rebuttal": [
        "Start with positives reviewers identified",
        "Quote reviewers directly, then respond",
        "Provide new data/experiments rather than arguing",
        "Do not promise — deliver",
    ],
}
def format_writing_tips(categories: list[str] | None = None) -> str:
    """Format writing tips as a prompt-injectable string.

    Parameters
    ----------
    categories:
        Subset of tip categories to include.  Any falsy value (``None`` or
        an empty list) selects all categories; unknown names are skipped.

    Returns
    -------
    str
        Formatted markdown-style tips block.
    """
    out: list[str] = ["## Conference Writing Best Practices"]
    for category in (categories or list(CONFERENCE_WRITING_TIPS.keys())):
        entries = CONFERENCE_WRITING_TIPS.get(category, [])
        if not entries:
            continue
        out.append(f"\n### {category.replace('_', ' ').title()}")
        out.extend(f"- {tip}" for tip in entries)
    return "\n".join(out)
================================================
FILE: scripts/metaclaw_start.sh
================================================
#!/bin/bash
# Start MetaClaw proxy for AutoResearchClaw integration.
#
# Usage:
#   ./scripts/metaclaw_start.sh               # skills_only mode (default)
#   ./scripts/metaclaw_start.sh madmax        # madmax mode (with RL training)
#   ./scripts/metaclaw_start.sh skills_only   # skills_only mode (explicit)
#
# Environment:
#   METACLAW_DIR  Override the MetaClaw checkout location
#                 (default: /home/jqliu/projects/MetaClaw)
set -e
MODE="${1:-skills_only}"
PORT="${2:-30000}"
# Fix: previously hard-coded to one developer's home directory; now
# overridable via METACLAW_DIR while keeping the old path as the default.
METACLAW_DIR="${METACLAW_DIR:-/home/jqliu/projects/MetaClaw}"
VENV="$METACLAW_DIR/.venv"
if [ ! -d "$VENV" ]; then
  echo "ERROR: MetaClaw venv not found at $VENV"
  echo "Run: cd $METACLAW_DIR && python -m venv .venv && source .venv/bin/activate && pip install -e '.[evolve,embedding]'"
  exit 1
fi
echo "Starting MetaClaw in ${MODE} mode on port ${PORT}..."
# Activate venv and start (exec replaces this shell with the proxy process)
source "$VENV/bin/activate"
exec metaclaw start --mode "$MODE" --port "$PORT"
================================================
FILE: scripts/plot_iteration_showcase.py
================================================
"""Generate promotional figure: Pipeline iterative improvement showcase.
Shows two experiment cases side-by-side demonstrating how the AutoResearchClaw
pipeline progressively improves experimental methods through self-iteration.
"""
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from pathlib import Path
# ── Styling ──────────────────────────────────────────────────────────────────
# Global matplotlib style: serif fonts, light dashed grid, white figure
# background — applied once before any axes are created (Agg backend was
# selected at import time, so this renders off-screen).
plt.rcParams.update({
    "font.family": "serif",
    "font.size": 11,
    "axes.titlesize": 13,
    "axes.labelsize": 11,
    "figure.facecolor": "white",
    "axes.facecolor": "#FAFAFA",
    "axes.grid": True,
    "grid.alpha": 0.3,
    "grid.linestyle": "--",
})
# Named colour palette (hex) used for lines, markers, and annotations below.
BLUE = "#1565C0"
GREEN = "#2E7D32"
RED = "#C62828"
ORANGE = "#E65100"
PURPLE = "#6A1B9A"
GRAY = "#757575"
# ── Data ─────────────────────────────────────────────────────────────────────
# Case 1: Continual Meta-Learning for Few-Shot Adaptation
case1_iters = [0, 1, 2, 3, 4]
case1_labels = [
    "Baseline\n(Initial Code)",
    "Iter 1\n(Deep Encoder\n+ Meta-SGD)",
    "Iter 2\n(Prototype Net\n— Regression)",
    "Iter 3\n(Linear Clf\n+ L2 Anchor)",
    "Iter 4\n(Converged)",
]
# Raw error rate per iteration; plotted as accuracy (%) = 100 * (1 - error).
case1_error = [0.7411, 0.1883, 0.2249, 0.0663, 0.0656]
case1_accuracy = [100 * (1 - e) for e in case1_error]
# Marker styles: green=improved, red=regressed, gray=no change
case1_colors = [GRAY, GREEN, RED, GREEN, GRAY]
# (Fix: removed the unused `case1_improved` list — it was assigned but never
# read anywhere in this script; status is already encoded by `case1_colors`.)

# Case 2: RLHF + Curriculum-Based Reward Shaping
case2_iters = [0, 1, 2, 3, 4]
case2_labels = [
    "Baseline\n(Vanilla PPO)",
    "Iter 1\n(No Change)",
    "Iter 2\n(+Reward Model\n+Curriculum)",
    "Iter 3\n(+Rank-Norm\n+Policy EMA)",
    "Iter 4\n(+Confidence\nGating)",
]
case2_error = [0.6443, 0.6443, 0.3843, 0.3696, 0.3344]
case2_alignment = [100 * (1 - e) for e in case2_error]
case2_colors = [GRAY, GRAY, GREEN, GREEN, GREEN]
# ── Figure ───────────────────────────────────────────────────────────────────
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
# ── Case 1: Meta-Learning ───────────────────────────────────────────────────
# Main line
ax1.plot(case1_iters, case1_accuracy, "o-", color=BLUE, linewidth=2.5,
         markersize=10, zorder=5, label="Few-Shot Accuracy")
# Colored markers for improvement status.
# (Fix: the enumerate index was never used — iterate the triples directly.)
for x, y, c in zip(case1_iters, case1_accuracy, case1_colors):
    ax1.scatter(x, y, s=120, color=c, zorder=6, edgecolors="white", linewidths=1.5)
# Annotate key improvements
ax1.annotate(
    "+55.3 pts\nDeep encoder\n+ context-gated replay",
    xy=(1, case1_accuracy[1]), xytext=(1.3, 55),
    fontsize=8.5, color=GREEN, fontweight="bold",
    arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5),
    ha="left",
)
ax1.annotate(
    "Prototype net\ntoo simple",
    xy=(2, case1_accuracy[2]), xytext=(2.25, 65),
    fontsize=8, color=RED, fontstyle="italic",
    arrowprops=dict(arrowstyle="->", color=RED, lw=1.2),
    ha="left",
)
ax1.annotate(
    "+15.9 pts\nLinear clf + L2 anchor\n+ cosine gating",
    xy=(3, case1_accuracy[3]), xytext=(2.5, 98),
    fontsize=8.5, color=GREEN, fontweight="bold",
    arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5),
    ha="left",
)
# Reference line for "ideal" performance
ax1.axhline(y=100, color=ORANGE, linestyle=":", alpha=0.6, linewidth=1.5)
ax1.text(4.3, 99, "Oracle (100%)", fontsize=8, color=ORANGE, ha="right",
         fontstyle="italic", va="top")
# Shaded improvement region (only where accuracy beats the baseline value)
ax1.fill_between(case1_iters, case1_accuracy, case1_accuracy[0],
                 where=[a >= case1_accuracy[0] for a in case1_accuracy],
                 alpha=0.08, color=BLUE)
ax1.set_xlabel("Self-Iteration Round", fontsize=12)
ax1.set_ylabel("Few-Shot Accuracy (%)", fontsize=12)
ax1.set_title("Case A: Continual Meta-Learning\nfor Few-Shot Adaptation", fontsize=13,
              fontweight="bold", pad=12)
ax1.set_ylim(15, 105)
ax1.set_xticks(case1_iters)
ax1.set_xticklabels(case1_labels, fontsize=7.5, ha="center")
# Summary box
summary1 = f"Baseline: {case1_accuracy[0]:.1f}% → Best: {case1_accuracy[3]:.1f}%\nImprovement: +{case1_accuracy[3]-case1_accuracy[0]:.1f} pts ({(case1_accuracy[3]-case1_accuracy[0])/case1_accuracy[0]*100:.0f}% rel.)"
ax1.text(0.02, 0.97, summary1, transform=ax1.transAxes, fontsize=9,
         verticalalignment="top", fontfamily="monospace",
         bbox=dict(boxstyle="round,pad=0.5", facecolor="#E3F2FD", alpha=0.9,
                   edgecolor=BLUE, linewidth=1.2))
# ── Case 2: RLHF ────────────────────────────────────────────────────────────
ax2.plot(case2_iters, case2_alignment, "s-", color=PURPLE, linewidth=2.5,
         markersize=10, zorder=5, label="Alignment Score")
# Status-coloured square markers.
# (Fix: the enumerate index was never used — iterate the triples directly.)
for x, y, c in zip(case2_iters, case2_alignment, case2_colors):
    ax2.scatter(x, y, s=120, color=c, zorder=6, edgecolors="white", linewidths=1.5,
                marker="s")
# Annotate
ax2.annotate(
    "No improvement\n(minor code fix)",
    xy=(1, case2_alignment[1]), xytext=(1.3, 30),
    fontsize=8, color=GRAY, fontstyle="italic",
    arrowprops=dict(arrowstyle="->", color=GRAY, lw=1.2),
    ha="left",
)
ax2.annotate(
    "+26.0 pts\n+Learned reward model\n+Curriculum scheduling",
    xy=(2, case2_alignment[2]), xytext=(1.8, 75),
    fontsize=8.5, color=GREEN, fontweight="bold",
    arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5),
    ha="left",
)
ax2.annotate(
    "+1.4 pts\n+Rank-norm\n+Policy EMA",
    xy=(3, case2_alignment[3]), xytext=(3.2, 73),
    fontsize=8, color=GREEN,
    arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.2),
    ha="left",
)
ax2.annotate(
    "+3.6 pts\n+Confidence gating\n+Mini-batch RM",
    xy=(4, case2_alignment[4]), xytext=(3.5, 80),
    fontsize=8.5, color=GREEN, fontweight="bold",
    arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5),
    ha="left",
)
# Shaded improvement (only where alignment beats the baseline value)
ax2.fill_between(case2_iters, case2_alignment, case2_alignment[0],
                 where=[a >= case2_alignment[0] for a in case2_alignment],
                 alpha=0.08, color=PURPLE)
ax2.set_xlabel("Self-Iteration Round", fontsize=12)
ax2.set_ylabel("LLM Alignment Score (%)", fontsize=12)
ax2.set_title("Case B: RLHF with Curriculum-Based\nReward Shaping for LLM Alignment", fontsize=13,
              fontweight="bold", pad=12)
ax2.set_ylim(15, 105)
ax2.set_xticks(case2_iters)
ax2.set_xticklabels(case2_labels, fontsize=7.5, ha="center")
summary2 = f"Baseline: {case2_alignment[0]:.1f}% → Best: {case2_alignment[4]:.1f}%\nImprovement: +{case2_alignment[4]-case2_alignment[0]:.1f} pts ({(case2_alignment[4]-case2_alignment[0])/case2_alignment[0]*100:.0f}% rel.)"
ax2.text(0.02, 0.97, summary2, transform=ax2.transAxes, fontsize=9,
         verticalalignment="top", fontfamily="monospace",
         bbox=dict(boxstyle="round,pad=0.5", facecolor="#F3E5F5", alpha=0.9,
                   edgecolor=PURPLE, linewidth=1.2))
# ── Legend ───────────────────────────────────────────────────────────────────
# Shared figure-level legend explaining the marker colour coding of both panels.
legend_elements = [
    mpatches.Patch(facecolor=GREEN, edgecolor="white", label="Improved"),
    mpatches.Patch(facecolor=RED, edgecolor="white", label="Regressed (auto-recovered)"),
    mpatches.Patch(facecolor=GRAY, edgecolor="white", label="No change / Baseline"),
]
fig.legend(handles=legend_elements, loc="lower center", ncol=3,
           fontsize=10, frameon=True, fancybox=True, framealpha=0.9,
           bbox_to_anchor=(0.5, -0.02))
# ── Suptitle ─────────────────────────────────────────────────────────────────
fig.suptitle(
    "AutoResearchClaw: Autonomous Self-Iterating Experiment Optimization",
    fontsize=15, fontweight="bold", y=1.02,
)
# Reserve a bottom margin so the figure-level legend does not overlap axes.
fig.tight_layout(rect=[0, 0.04, 1, 0.98])
# ── Save ─────────────────────────────────────────────────────────────────────
# Outputs go to <repo>/docs/figures, created on demand.
out_dir = Path(__file__).resolve().parent.parent / "docs" / "figures"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "iteration_improvement_showcase.png"
fig.savefig(out_path, dpi=200, bbox_inches="tight", facecolor="white")
print(f"Saved: {out_path}")
# Also save a PDF version for papers
pdf_path = out_dir / "iteration_improvement_showcase.pdf"
fig.savefig(pdf_path, bbox_inches="tight", facecolor="white")
print(f"Saved: {pdf_path}")
plt.close(fig)
================================================
FILE: scripts/test_beast_mode_e2e.py
================================================
#!/usr/bin/env python3
"""End-to-end integration test for OpenCode Beast Mode.
Simulates Pipeline stages 1-9 artifacts, then invokes Beast Mode
to generate experiment code via OpenCode CLI.
Usage:
python scripts/test_beast_mode_e2e.py
"""
from __future__ import annotations
import json
import sys
import textwrap
import time
from pathlib import Path
# Add project root to path
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from researchclaw.pipeline.opencode_bridge import (
OpenCodeBridge,
count_historical_failures,
score_complexity,
)
# ============================================================
# Simulated Pipeline Artifacts
# ============================================================
# Research topic fed to both score_complexity() and bridge.generate() below.
TOPIC = (
    "Adaptive Mixtures of Local Experts for Image Classification: "
    "Dynamic Gating with Load-Balanced Sparse Routing on CIFAR-10"
)
# Simulated Stage 9 output: exp_plan.yaml content
EXP_PLAN = textwrap.dedent("""\
topic: >
Adaptive Mixtures of Local Experts for Image Classification:
Dynamic Gating with Load-Balanced Sparse Routing on CIFAR-10
objectives:
- Investigate whether sparse Mixture-of-Experts (MoE) routing improves
accuracy over dense baselines under a fixed parameter budget
- Compare top-k routing vs soft routing vs hash-based routing
- Ablate the load-balancing auxiliary loss
datasets:
- CIFAR-10 (pre-cached at /opt/datasets/cifar10)
baselines:
- name: dense_resnet18
description: Standard ResNet-18 with all parameters active
implementation_spec:
class_name: DenseResNet18Trainer
key_hyperparameters:
batch_size: 128
learning_rate: 0.1
epochs: 20
weight_decay: 5e-4
- name: dense_wider_resnet
description: Wider ResNet with ~same FLOPs as MoE model
implementation_spec:
class_name: DenseWiderResNetTrainer
key_hyperparameters:
batch_size: 128
learning_rate: 0.1
epochs: 20
proposed_methods:
- name: topk_sparse_moe
description: >
Sparse MoE with top-2 gating. Each MoE layer has 4 expert MLPs,
a gating network selects top-2 per token. Load-balancing loss
ensures even expert utilization.
implementation_spec:
class_name: TopKSparseMoETrainer
algorithm_steps:
- Build backbone CNN (first 3 ResNet blocks)
- Replace final block with MoE layer (4 experts, top-2 gating)
- Gating network: linear projection → softmax → top-k selection
- Load-balance loss: CV of expert load across batch
- Total loss = CE + lambda_lb * load_balance_loss
key_hyperparameters:
batch_size: 128
learning_rate: 0.05
epochs: 20
num_experts: 4
top_k: 2
lambda_lb: 0.01
- name: soft_routing_moe
description: >
Soft MoE where all experts contribute with learned weights
(no hard top-k). Softer gradient flow but higher compute.
implementation_spec:
class_name: SoftRoutingMoETrainer
key_hyperparameters:
batch_size: 128
learning_rate: 0.05
epochs: 20
num_experts: 4
ablations:
- name: topk_moe_no_load_balance
description: TopK MoE without load-balancing loss (lambda_lb=0)
what_is_removed: Load-balancing auxiliary loss
expected_effect: Expert collapse — one expert dominates, accuracy drops
how_it_differs:
- Set lambda_lb = 0
- Everything else identical to topk_sparse_moe
- name: topk_moe_single_expert
description: TopK MoE with top_k=1 (only one expert per sample)
what_is_removed: Multi-expert routing (reduced to single expert)
expected_effect: Reduced model capacity per sample, likely lower accuracy
how_it_differs:
- Set top_k = 1 instead of 2
- Keep load-balancing loss active
metrics:
primary_metric:
name: test_accuracy
direction: maximize
description: Classification accuracy on CIFAR-10 test set
secondary_metrics:
- name: expert_utilization_cv
description: Coefficient of variation of expert usage (lower = more balanced)
- name: training_time_sec
description: Wall-clock training time
compute_budget:
effective_time_seconds: 240
estimated_seconds_per_run: 40
seeds_per_condition: 3
total_conditions: 6
notes:
- Use small models (< 5M params) to fit within budget
- Use 20 epochs max
- Early stopping if no improvement for 5 epochs
""")
PKG_HINT = textwrap.dedent("""\
AVAILABLE PACKAGES (docker mode): Python stdlib, numpy, torch, sklearn, scipy, pandas,
torchvision, torchaudio, matplotlib, seaborn, scipy, tqdm, transformers, datasets,
timm, einops, torchmetrics, and additional pip-installable packages via requirements.txt.
GPU: NVIDIA RTX 6000 Ada (cuda). You MAY use PyTorch with GPU acceleration.
Use `device = torch.device('cuda')` for tensor operations.
## Compute Budget Constraint
- Total execution time limit: 240 seconds
- Design experiments that complete within this budget
- Implement a time guard: stop gracefully at 80% of budget (192 seconds)
""")
EXTRA_GUIDANCE = textwrap.dedent("""\
## Dataset Guidance
CIFAR-10 is pre-cached at /opt/datasets/cifar10.
Use: torchvision.datasets.CIFAR10(root='/opt/datasets/cifar10', download=False)
## Multi-Seed Enforcement
Run each condition with seeds [0, 1, 2]. Report mean ± std for all metrics.
## Hyperparameter Reporting
Print all hyperparameters at the start of each condition run.
""")
def main() -> None:
    """Run the Beast Mode end-to-end test.

    Steps: (1) score experiment-plan complexity, (2) verify the OpenCode CLI
    is installed, (3) invoke OpenCode via OpenCodeBridge in a fresh stage
    directory, (4) report success/failure and persist logs, (5) grade the
    generated code with heuristic string checks and an AST parse.  Exits
    with status 1 when the CLI is missing or generation fails.
    """
    print("=" * 70)
    print("OpenCode Beast Mode — End-to-End Integration Test")
    print("=" * 70)
    # Step 1: Complexity scoring
    cplx = score_complexity(
        exp_plan=EXP_PLAN,
        topic=TOPIC,
        historical_failures=0,
        threshold=0.4,  # Lower threshold to ensure trigger for this test
    ) if False else None  # placeholder — see real call below
    # (The guard above is never taken; real flow starts here.)
    print("\n[Step 1] Complexity scoring...")
    cplx = score_complexity(
        exp_plan=EXP_PLAN,
        topic=TOPIC,
        historical_failures=0,
        threshold=0.4,  # Lower threshold to ensure trigger for this test
    )
    print(f" Score: {cplx.score:.4f}")
    print(f" Signals: {json.dumps(cplx.signals, indent=4)}")
    print(f" Recommendation: {cplx.recommendation}")
    print(f" Reason: {cplx.reason}")
    if cplx.recommendation != "beast_mode":
        print("\n [!] Score below threshold. Forcing beast mode for test purposes.\n")
    # Step 2: Check OpenCode availability
    print("\n[Step 2] Checking OpenCode availability...")
    available = OpenCodeBridge.check_available()
    if not available:
        print(" [FATAL] OpenCode CLI not installed. Cannot proceed.")
        sys.exit(1)
    print(" OpenCode CLI: OK")
    # Step 3: Create test workspace and invoke
    print("\n[Step 3] Invoking OpenCode beast mode...")
    test_dir = PROJECT_ROOT / "test_outputs_beast_mode"
    test_dir.mkdir(parents=True, exist_ok=True)
    # Timestamped stage dir so repeated runs never collide.
    stage_dir = test_dir / f"stage-10_{int(time.time())}"
    stage_dir.mkdir(parents=True, exist_ok=True)
    # Write complexity analysis
    (stage_dir / "complexity_analysis.json").write_text(
        json.dumps({
            "score": cplx.score,
            "signals": cplx.signals,
            "recommendation": cplx.recommendation,
            "reason": cplx.reason,
        }, indent=2),
        encoding="utf-8",
    )
    # NOTE: Azure AI Services endpoints don't support OpenCode's Responses API.
    # The bridge auto-detects Azure and falls back to Anthropic provider.
    bridge = OpenCodeBridge(
        model="anthropic/claude-sonnet-4-6",  # Direct Anthropic model
        llm_base_url="https://huaxi-mlg4x1rk-eastus2.services.ai.azure.com/openai/v1",
        api_key_env="AZURE_OPENAI_API_KEY",
        llm_provider="azure",
        timeout_sec=300,
        max_retries=1,
        workspace_cleanup=False,  # Keep workspace for inspection
    )
    t0 = time.time()
    result = bridge.generate(
        stage_dir=stage_dir,
        topic=TOPIC,
        exp_plan=EXP_PLAN,
        metric="test_accuracy",
        pkg_hint=PKG_HINT,
        extra_guidance=EXTRA_GUIDANCE,
        time_budget_sec=240,
    )
    elapsed = time.time() - t0
    # Step 4: Evaluate results
    print(f"\n[Step 4] Results (elapsed: {elapsed:.1f}s)")
    print(f" Success: {result.success}")
    print(f" Error: {result.error or 'None'}")
    print(f" Files: {list(result.files.keys())}")
    print(f" OpenCode elapsed: {result.elapsed_sec:.1f}s")
    if not result.success:
        print(f"\n [FAILED] Beast mode failed: {result.error}")
        print(f" Log (last 1000 chars):\n{result.opencode_log[-1000:]}")
        # Write log for debugging
        (stage_dir / "opencode_log.txt").write_text(
            result.opencode_log, encoding="utf-8",
        )
        (stage_dir / "beast_mode_log.json").write_text(
            json.dumps({
                "success": False,
                "error": result.error,
                "elapsed_sec": result.elapsed_sec,
            }, indent=2),
            encoding="utf-8",
        )
        sys.exit(1)
    # Write generated files
    exp_dir = stage_dir / "experiment"
    exp_dir.mkdir(parents=True, exist_ok=True)
    for fname, code in result.files.items():
        fpath = exp_dir / fname
        fpath.parent.mkdir(parents=True, exist_ok=True)
        fpath.write_text(code, encoding="utf-8")
    print(f"\n Files written to: {exp_dir}")
    # Write beast mode log
    (stage_dir / "beast_mode_log.json").write_text(
        json.dumps({
            "success": True,
            "elapsed_sec": result.elapsed_sec,
            "files": list(result.files.keys()),
        }, indent=2),
        encoding="utf-8",
    )
    # Step 5: Quality evaluation — heuristic substring checks on main.py.
    print("\n[Step 5] Quality evaluation...")
    checks = {
        "main.py exists": "main.py" in result.files,
        "main.py is non-empty": len(result.files.get("main.py", "")) > 100,
        "Has metric print": "test_accuracy" in result.files.get("main.py", ""),
        "Has seed loop": "seed" in result.files.get("main.py", "").lower(),
        "Has CIFAR-10": "cifar" in result.files.get("main.py", "").lower(),
        "Has torch import": "import torch" in result.files.get("main.py", ""),
        "No argparse": "argparse" not in result.files.get("main.py", ""),
        "Has multiple conditions": any(
            kw in result.files.get("main.py", "").lower()
            for kw in ["baseline", "dense", "moe", "expert", "condition"]
        ),
        "Has time guard": any(
            kw in result.files.get("main.py", "")
            for kw in ["time.time", "time.monotonic", "time_budget", "time_limit"]
        ),
    }
    all_pass = True
    for check_name, passed in checks.items():
        status = "PASS" if passed else "FAIL"
        if not passed:
            all_pass = False
        print(f" [{status}] {check_name}")
    # Count lines of code
    total_loc = sum(len(code.splitlines()) for code in result.files.values())
    py_files = [f for f in result.files if f.endswith(".py")]
    print(f"\n Total files: {len(result.files)}")
    print(f" Python files: {len(py_files)}")
    print(f" Total lines of code: {total_loc}")
    # Try AST parsing main.py
    import ast
    try:
        ast.parse(result.files["main.py"])
        print(" [PASS] main.py AST parse: valid Python")
    except SyntaxError as e:
        print(f" [FAIL] main.py AST parse error: {e}")
        all_pass = False
    # Print first 50 lines of main.py for manual inspection
    main_lines = result.files.get("main.py", "").splitlines()
    print(f"\n --- main.py preview (first 50 of {len(main_lines)} lines) ---")
    for i, line in enumerate(main_lines[:50], 1):
        print(f" {i:4d} | {line}")
    if len(main_lines) > 50:
        print(f" ... ({len(main_lines) - 50} more lines)")
    # Final verdict
    print("\n" + "=" * 70)
    pass_count = sum(1 for v in checks.values() if v)
    total = len(checks)
    if all_pass:
        print(f"VERDICT: ALL CHECKS PASSED ({pass_count}/{total})")
    else:
        print(f"VERDICT: {pass_count}/{total} checks passed")
    print(f"Stage dir: {stage_dir}")
    print("=" * 70)
# Standalone entry point: python scripts/test_beast_mode_e2e.py
if __name__ == "__main__":
    main()
================================================
FILE: scripts/test_code_agent_live.py
================================================
#!/usr/bin/env python3
"""Live test of CodeAgent with real LLM — evaluates code generation quality.
This script directly invokes the CodeAgent with real experiment plans
and evaluates the quality of generated code. No full pipeline needed.
Usage:
python scripts/test_code_agent_live.py [--model gpt-4.1] [--test-id 1]
"""
from __future__ import annotations
import argparse
import ast
import json
import os
import sys
import time
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from researchclaw.llm.client import LLMClient, LLMConfig
from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig
from researchclaw.prompts import PromptManager
# ---------------------------------------------------------------------------
# Test cases — progressively harder experiment scenarios
# ---------------------------------------------------------------------------
TEST_CASES = {
1: {
"name": "Vision Transformer on CIFAR-10",
"topic": (
"Comparing Vision Transformer (ViT) variants for image classification: "
"investigate how patch size, number of attention heads, and positional "
"encoding strategies affect classification accuracy on CIFAR-10"
),
"exp_plan": """
objectives:
- Compare ViT-Tiny variants with different patch sizes (4, 8, 16)
- Evaluate multi-head self-attention with different head counts (2, 4, 8)
- Compare learnable vs sinusoidal positional encodings
datasets:
- name: CIFAR-10
source: torchvision.datasets.CIFAR10
train_size: 50000
test_size: 10000
baselines:
- name: SimpleViT-P16
description: Standard ViT with patch_size=16, 4 heads, learnable pos encoding
proposed_methods:
- name: SmallPatch-ViT
implementation_spec:
class_name: SmallPatchViT
key_methods: [forward, _create_patches, _attention]
differentiator: Uses patch_size=4 for finer-grained spatial features
- name: ManyHead-ViT
implementation_spec:
class_name: ManyHeadViT
key_methods: [forward, _multi_head_attention]
differentiator: Uses 8 attention heads instead of 4
ablations:
- name: SinusoidalPos-ViT
description: Replace learnable positional encoding with sinusoidal
metrics:
- accuracy (higher is better)
- training_loss
compute_budget:
time_limit_sec: 300
epochs: 10
""",
"metric": "accuracy",
"min_files": 2,
"min_classes": 3,
"required_imports": ["torch", "torchvision"],
},
2: {
"name": "Distribution Shift Detection via Uncertainty",
"topic": (
"Detecting distribution shift in deployed ML models using "
"uncertainty estimation: comparing Monte Carlo Dropout, "
"Deep Ensembles, and Spectral-Normalized Neural GP (SNGP) "
"for out-of-distribution detection on corrupted CIFAR-10"
),
"exp_plan": """
objectives:
- Implement 3 uncertainty estimation methods for OOD detection
- Evaluate on CIFAR-10 vs CIFAR-10-C (corrupted) as OOD
- Compare AUROC for separating in-distribution from OOD samples
datasets:
- name: CIFAR-10
source: torchvision.datasets.CIFAR10
role: in-distribution
- name: CIFAR-10-C
source: Generated via Gaussian noise corruption
role: out-of-distribution
baselines:
- name: MCDropout
description: Monte Carlo Dropout with 30 forward passes, mean+std of softmax
implementation_spec:
class_name: MCDropoutDetector
key_methods: [predict_with_uncertainty, _mc_forward, compute_auroc]
differentiator: Standard MC Dropout baseline
proposed_methods:
- name: DeepEnsemble
implementation_spec:
class_name: DeepEnsembleDetector
key_methods: [train_ensemble, predict_with_uncertainty, _member_forward]
differentiator: Trains 3 independent models, uses prediction disagreement
- name: SNGP
implementation_spec:
class_name: SNGPDetector
key_methods: [forward, _spectral_norm_layer, _gp_output_layer]
differentiator: Spectral normalization + GP output layer for distance-aware uncertainty
ablations:
- name: MCDropout-10passes
description: MC Dropout with only 10 forward passes (reduced compute)
metrics:
- auroc (higher is better)
- ece (expected calibration error, lower is better)
compute_budget:
time_limit_sec: 300
epochs: 5
""",
"metric": "auroc",
"min_files": 2,
"min_classes": 4,
"required_imports": ["torch", "numpy"],
},
3: {
"name": "Meta-Learning Few-Shot with MAML",
"topic": (
"Few-shot learning with gradient-based meta-learning: comparing "
"MAML, Reptile, and Prototypical Networks on Omniglot-style "
"synthetic tasks with 5-way 1-shot and 5-way 5-shot settings"
),
"exp_plan": """
objectives:
- Implement 3 few-shot learning algorithms from scratch
- Evaluate on synthetic few-shot tasks (5-way, 1-shot and 5-shot)
- Compare accuracy and convergence speed
datasets:
- name: SyntheticFewShot
source: Generated in-code (random linear classification tasks)
n_classes: 20
samples_per_class: 20
baselines:
- name: ProtoNet
description: Prototypical Networks — learn embedding, classify by nearest class prototype
implementation_spec:
class_name: PrototypicalNetwork
key_methods: [embed, compute_prototypes, classify, meta_train_step]
differentiator: Non-gradient meta-learning baseline using metric space
proposed_methods:
- name: MAML
implementation_spec:
class_name: MAMLLearner
key_methods: [inner_loop, outer_loop, meta_train_step, adapt]
differentiator: Second-order gradient-based meta-learning with inner loop adaptation
- name: Reptile
implementation_spec:
class_name: ReptileLearner
key_methods: [inner_loop, meta_update, meta_train_step]
differentiator: First-order approximation — SGD on tasks, move toward task-optimal weights
ablations:
- name: MAML-FirstOrder
description: MAML with first-order approximation (no second derivatives)
metrics:
- accuracy (higher is better)
- meta_train_loss
compute_budget:
time_limit_sec: 300
meta_epochs: 200
inner_steps: 5
inner_lr: 0.01
""",
"metric": "accuracy",
"min_files": 2,
"min_classes": 3,
"required_imports": ["torch"],
},
}
# ---------------------------------------------------------------------------
# Code quality analysis
# ---------------------------------------------------------------------------
def analyze_code_quality(files: dict[str, str], test_case: dict) -> dict:
"""Analyze the quality of generated code."""
report = {
"test_name": test_case["name"],
"num_files": len(files),
"file_names": list(files.keys()),
"total_lines": 0,
"effective_lines": 0,
"classes_found": [],
"functions_found": [],
"imports_found": [],
"issues": [],
"scores": {},
}
all_code = ""
for fname, code in files.items():
all_code += code + "\n"
lines = code.split("\n")
report["total_lines"] += len(lines)
effective = [
l for l in lines
if l.strip() and not l.strip().startswith("#") and not l.strip().startswith("import") and not l.strip().startswith("from")
]
report["effective_lines"] += len(effective)
# AST analysis
try:
tree = ast.parse(code)
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
methods = [
n.name for n in node.body
if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
]
method_lines = sum(
n.end_lineno - n.lineno + 1
for n in node.body
if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
and n.end_lineno
)
report["classes_found"].append({
"name": node.name,
"file": fname,
"methods": methods,
"method_count": len(methods),
"total_method_lines": method_lines,
})
elif isinstance(node, ast.FunctionDef) and node.col_offset == 0:
report["functions_found"].append({
"name": node.name,
"file": fname,
"lines": (node.end_lineno or node.lineno) - node.lineno + 1,
})
elif isinstance(node, (ast.Import, ast.ImportFrom)):
if isinstance(node, ast.Import):
for alias in node.names:
report["imports_found"].append(alias.name.split(".")[0])
else:
if node.module:
report["imports_found"].append(node.module.split(".")[0])
except SyntaxError as e:
report["issues"].append(f"SyntaxError in {fname}: {e}")
report["imports_found"] = sorted(set(report["imports_found"]))
# Scoring
# 1. File count (target: min_files)
file_score = min(10, (len(files) / test_case["min_files"]) * 10)
report["scores"]["file_structure"] = round(file_score, 1)
# 2. Class count (target: min_classes)
class_score = min(10, (len(report["classes_found"]) / test_case["min_classes"]) * 10)
report["scores"]["class_coverage"] = round(class_score, 1)
# 3. Code depth (effective lines)
depth_score = min(10, report["effective_lines"] / 30) # 300 lines = 10
report["scores"]["code_depth"] = round(depth_score, 1)
# 4. Method richness (average methods per class)
if report["classes_found"]:
avg_methods = sum(c["method_count"] for c in report["classes_found"]) / len(report["classes_found"])
method_score = min(10, avg_methods / 0.5) # 5 methods/class = 10
report["scores"]["method_richness"] = round(method_score, 1)
else:
report["scores"]["method_richness"] = 0
# 5. Import coverage
required = set(test_case.get("required_imports", []))
found = set(report["imports_found"])
if required:
import_score = len(required & found) / len(required) * 10
else:
import_score = 10
report["scores"]["import_coverage"] = round(import_score, 1)
# 6. Syntax validity
syntax_score = 10 if not any("SyntaxError" in i for i in report["issues"]) else 0
report["scores"]["syntax_valid"] = syntax_score
# Overall score
scores = report["scores"]
report["overall_score"] = round(
sum(scores.values()) / len(scores), 1
)
# Quality checks
if len(files) < test_case["min_files"]:
report["issues"].append(
f"Too few files: {len(files)} < {test_case['min_files']}"
)
if len(report["classes_found"]) < test_case["min_classes"]:
report["issues"].append(
f"Too few classes: {len(report['classes_found'])} < {test_case['min_classes']}"
)
for cls in report["classes_found"]:
if cls["total_method_lines"] < 10:
report["issues"].append(
f"Class {cls['name']} has only {cls['total_method_lines']} method lines (too thin)"
)
return report
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: generate code for one or all test cases and score it.

    Reads OPENAI_BASE_URL / OPENAI_API_KEY from the environment, runs
    ``CodeAgent.generate()`` per selected test case (no sandbox), writes the
    generated files plus a ``quality_report.json`` under ``--output-dir``,
    and prints a per-case and aggregate summary. Exits with status 1 on
    missing credentials, failed LLM preflight, or an unknown ``--test-id``.
    """
    parser = argparse.ArgumentParser(description="Live test CodeAgent quality")
    parser.add_argument("--model", default="gpt-4.1", help="Model to use")
    parser.add_argument("--test-id", type=int, default=0, help="Test case ID (0=all)")
    parser.add_argument("--no-sandbox", action="store_true", help="Skip sandbox exec-fix")
    parser.add_argument("--tree-search", action="store_true", help="Enable tree search")
    parser.add_argument("--output-dir", default="test_outputs", help="Output directory")
    args = parser.parse_args()
    # Setup LLM client from environment credentials.
    base_url = os.environ.get("OPENAI_BASE_URL", "")
    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not base_url or not api_key:
        print("ERROR: Set OPENAI_BASE_URL and OPENAI_API_KEY environment variables")
        sys.exit(1)
    llm_config = LLMConfig(
        base_url=base_url,
        api_key=api_key,
        primary_model=args.model,
        fallback_models=[],
        max_tokens=16384,
        temperature=0.7,
        timeout_sec=300,
    )
    llm = LLMClient(llm_config)
    # Quick connectivity test before doing any expensive generation.
    print(f"Testing LLM connectivity ({args.model})... ", end="", flush=True)
    ok, msg = llm.preflight()
    if not ok:
        print(f"FAILED: {msg}")
        sys.exit(1)
    print("OK")
    pm = PromptManager()
    # Select test cases: a positive --test-id picks one; 0 runs them all.
    if args.test_id > 0:
        if args.test_id not in TEST_CASES:
            print(f"ERROR: Unknown test ID {args.test_id}. Available: {list(TEST_CASES.keys())}")
            sys.exit(1)
        cases = {args.test_id: TEST_CASES[args.test_id]}
    else:
        cases = TEST_CASES
    # Output directory (one sub-directory per test case below).
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    all_reports = []
    for test_id, tc in cases.items():
        print(f"\n{'='*60}")
        print(f"Test {test_id}: {tc['name']}")
        print(f"{'='*60}")
        stage_dir = output_dir / f"test_{test_id}"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # --no-sandbox disables the exec-fix loop by setting its budget to 0.
        config = CodeAgentConfig(
            architecture_planning=True,
            exec_fix_max_iterations=0 if args.no_sandbox else 3,
            tree_search_enabled=args.tree_search,
            review_max_rounds=2,
        )
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=config,
            stage_dir=stage_dir,
            sandbox_factory=None,  # No sandbox for quick test
        )
        t0 = time.time()
        result = agent.generate(
            topic=tc["topic"],
            exp_plan=tc["exp_plan"],
            metric=tc["metric"],
            pkg_hint=(
                "\nAVAILABLE PACKAGES (docker mode): Python stdlib, numpy, "
                "torch, torchvision, sklearn, scipy, pandas, matplotlib.\n"
                "GPU: NVIDIA RTX 6000 Ada (49GB VRAM). "
                "Use `device = torch.device('cuda')` for tensor operations.\n"
            ),
            max_tokens=16384,
        )
        elapsed = time.time() - t0
        print(f"\nGeneration time: {elapsed:.1f}s")
        print(f"LLM calls: {result.total_llm_calls}")
        print(f"Review rounds: {result.review_rounds}")
        print(f"Architecture spec: {len(result.architecture_spec)} chars")
        # Write generated files to disk (supports nested paths).
        for fname, code in result.files.items():
            fpath = stage_dir / fname
            fpath.parent.mkdir(parents=True, exist_ok=True)
            fpath.write_text(code, encoding="utf-8")
            lines = len(code.split("\n"))
            print(f" {fname}: {lines} lines")
        # Write architecture spec, if the agent produced one.
        if result.architecture_spec:
            (stage_dir / "architecture_spec.yaml").write_text(
                result.architecture_spec, encoding="utf-8"
            )
        # Analyze quality of the generated files and attach run metadata.
        report = analyze_code_quality(result.files, tc)
        report["generation_time_sec"] = round(elapsed, 1)
        report["llm_calls"] = result.total_llm_calls
        report["review_rounds"] = result.review_rounds
        report["architecture_spec_chars"] = len(result.architecture_spec)
        # Print report
        print(f"\n--- Quality Report ---")
        print(f"Files: {report['num_files']}")
        print(f"Total lines: {report['total_lines']}")
        print(f"Effective lines: {report['effective_lines']}")
        print(f"Classes: {len(report['classes_found'])}")
        for cls in report["classes_found"]:
            print(f" - {cls['name']} ({cls['method_count']} methods, {cls['total_method_lines']} lines)")
        print(f"Imports: {', '.join(report['imports_found'])}")
        print(f"\nScores:")
        for k, v in report["scores"].items():
            print(f" {k}: {v}/10")
        print(f" OVERALL: {report['overall_score']}/10")
        if report["issues"]:
            print(f"\nIssues:")
            for issue in report["issues"]:
                print(f" - {issue}")
        # Save per-case report
        (stage_dir / "quality_report.json").write_text(
            json.dumps(report, indent=2), encoding="utf-8"
        )
        all_reports.append(report)
    # Summary across cases (only when more than one case ran).
    if len(all_reports) > 1:
        print(f"\n{'='*60}")
        print("SUMMARY")
        print(f"{'='*60}")
        for r in all_reports:
            print(f" {r['test_name']}: {r['overall_score']}/10 "
                  f"({r['effective_lines']} lines, {len(r['classes_found'])} classes)")
        avg = sum(r["overall_score"] for r in all_reports) / len(all_reports)
        print(f"\n Average: {avg:.1f}/10")
    # Save all reports in one aggregate file.
    (output_dir / "all_reports.json").write_text(
        json.dumps(all_reports, indent=2), encoding="utf-8"
    )
    print(f"\nAll outputs saved to: {output_dir}/")
# Script entry point.
if __name__ == "__main__":
    main()
================================================
FILE: scripts/test_code_agent_sandbox.py
================================================
#!/usr/bin/env python3
"""Test CodeAgent with Docker sandbox exec-fix loop.
Generates code with Phase 1-4 (architecture, exec-fix, review),
runs in Docker sandbox, verifies the exec-fix loop catches and fixes errors.
Usage:
python scripts/test_code_agent_sandbox.py [--model gpt-5.1] [--test-id 1]
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from researchclaw.config import DockerSandboxConfig, ExperimentConfig
from researchclaw.experiment.docker_sandbox import DockerSandbox
from researchclaw.llm.client import LLMClient, LLMConfig
from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig
from researchclaw.prompts import PromptManager
# ---------------------------------------------------------------------------
# Test case (simple — should run quickly in sandbox)
# ---------------------------------------------------------------------------
# Built-in test cases for the sandbox run. Each entry supplies a topic, a
# YAML experiment plan that is passed verbatim to CodeAgent.generate(), and
# the name of the primary metric. Budgets are small (<=120s, 3 epochs) so
# the generated code can finish inside the sandbox timeout.
# NOTE(review): the triple-quoted plans are runtime strings; their exact
# text (including layout) is what the agent receives.
TEST_CASES = {
    # Case 1: supervised classification — compares ViT patch-size/head-count variants.
    1: {
        "name": "ViT on CIFAR-10 (sandbox)",
        "topic": (
            "Comparing Vision Transformer (ViT) variants for image classification: "
            "investigate how patch size and number of attention heads affect "
            "classification accuracy on CIFAR-10"
        ),
        "exp_plan": """
objectives:
- Compare ViT-Tiny variants with different patch sizes (4, 16)
- Evaluate multi-head self-attention with different head counts (4, 8)
datasets:
- name: CIFAR-10
source: torchvision.datasets.CIFAR10
train_size: 50000
test_size: 10000
baselines:
- name: SimpleViT-P16
description: Standard ViT with patch_size=16, 4 heads, learnable pos encoding
proposed_methods:
- name: SmallPatch-ViT
implementation_spec:
class_name: SmallPatchViT
key_methods: [forward, _create_patches, _attention]
differentiator: Uses patch_size=4 for finer-grained spatial features
- name: ManyHead-ViT
implementation_spec:
class_name: ManyHeadViT
key_methods: [forward, _multi_head_attention]
differentiator: Uses 8 attention heads instead of 4
ablations:
- name: SinusoidalPos-ViT
description: Replace learnable positional encoding with sinusoidal
metrics:
- accuracy (higher is better)
- training_loss
compute_budget:
time_limit_sec: 120
epochs: 3
""",
        "metric": "accuracy",
    },
    # Case 2: uncertainty estimation — MC Dropout vs Deep Ensembles for OOD detection.
    2: {
        "name": "OOD Detection (sandbox)",
        "topic": (
            "Detecting distribution shift using uncertainty estimation: "
            "comparing Monte Carlo Dropout and Deep Ensembles "
            "for out-of-distribution detection on corrupted CIFAR-10"
        ),
        "exp_plan": """
objectives:
- Implement 2 uncertainty estimation methods for OOD detection
- Evaluate on CIFAR-10 vs Gaussian noise corruption as OOD
- Compare AUROC for separating in-distribution from OOD samples
datasets:
- name: CIFAR-10
source: torchvision.datasets.CIFAR10
role: in-distribution
- name: CIFAR-10-C
source: Generated via Gaussian noise corruption
role: out-of-distribution
baselines:
- name: MCDropout
description: Monte Carlo Dropout with 20 forward passes
implementation_spec:
class_name: MCDropoutDetector
key_methods: [predict_with_uncertainty, _mc_forward, compute_auroc]
proposed_methods:
- name: DeepEnsemble
implementation_spec:
class_name: DeepEnsembleDetector
key_methods: [train_ensemble, predict_with_uncertainty]
differentiator: Trains 3 independent models, uses prediction disagreement
ablations:
- name: MCDropout-5passes
description: MC Dropout with only 5 forward passes
metrics:
- auroc (higher is better)
compute_budget:
time_limit_sec: 120
epochs: 3
""",
        "metric": "auroc",
    },
}
def make_sandbox_factory(docker_cfg: DockerSandboxConfig):
    """Build a sandbox factory bound to a fixed Docker configuration.

    The returned callable has the ``(exp_config, workdir) -> sandbox``
    shape expected by CodeAgent; ``docker_cfg`` is captured by closure.
    """
    def _build(exp_config, workdir: Path):
        # exp_config is accepted for interface compatibility but unused:
        # the Docker configuration is fixed at factory-creation time.
        return DockerSandbox(docker_cfg, workdir)
    return _build
def main():
    """CLI entry point: generate code with the Docker-backed exec-fix loop.

    Verifies LLM connectivity and Docker availability, runs
    ``CodeAgent.generate()`` for one test case with a real sandbox factory,
    writes the generated files and a ``validation_log.json``, then performs a
    final end-to-end run of the generated project inside the sandbox and
    saves ``final_run_result.json``. Exits with status 1 on missing
    credentials, failed preflight, missing Docker/image, or unknown test ID.
    """
    parser = argparse.ArgumentParser(description="Test CodeAgent with Docker sandbox")
    parser.add_argument("--model", default="gpt-5.1", help="Model to use")
    parser.add_argument("--test-id", type=int, default=1, help="Test case ID")
    parser.add_argument("--output-dir", default="test_outputs_sandbox", help="Output dir")
    parser.add_argument("--exec-fix-iters", type=int, default=3, help="Max exec-fix iterations")
    parser.add_argument("--timeout", type=int, default=180, help="Sandbox timeout (sec)")
    args = parser.parse_args()
    # Setup LLM from environment credentials.
    base_url = os.environ.get("OPENAI_BASE_URL", "")
    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not base_url or not api_key:
        print("ERROR: Set OPENAI_BASE_URL and OPENAI_API_KEY")
        sys.exit(1)
    llm_config = LLMConfig(
        base_url=base_url,
        api_key=api_key,
        primary_model=args.model,
        fallback_models=[],
        max_tokens=16384,
        temperature=0.7,
        timeout_sec=300,
    )
    llm = LLMClient(llm_config)
    print(f"Testing LLM connectivity ({args.model})... ", end="", flush=True)
    ok, msg = llm.preflight()
    if not ok:
        print(f"FAILED: {msg}")
        sys.exit(1)
    print("OK")
    # Docker sandbox setup: fail fast if the daemon or image is missing.
    docker_cfg = DockerSandboxConfig(
        image="researchclaw/experiment:latest",
        gpu_enabled=True,
        memory_limit_mb=16384,
        network_policy="setup_only",
    )
    if not DockerSandbox.check_docker_available():
        print("ERROR: Docker not available")
        sys.exit(1)
    if not DockerSandbox.ensure_image(docker_cfg.image):
        print(f"ERROR: Docker image {docker_cfg.image} not found")
        sys.exit(1)
    print(f"Docker sandbox ready: {docker_cfg.image}")
    # Select test case
    tc = TEST_CASES.get(args.test_id)
    if not tc:
        print(f"ERROR: Unknown test ID {args.test_id}")
        sys.exit(1)
    pm = PromptManager()
    output_dir = Path(args.output_dir)
    stage_dir = output_dir / f"test_{args.test_id}"
    stage_dir.mkdir(parents=True, exist_ok=True)
    # CodeAgent with the sandbox-backed exec-fix loop enabled.
    config = CodeAgentConfig(
        architecture_planning=True,
        exec_fix_max_iterations=args.exec_fix_iters,
        exec_fix_timeout_sec=args.timeout,
        tree_search_enabled=False,
        review_max_rounds=2,
    )
    sandbox_factory = make_sandbox_factory(docker_cfg)
    agent = CodeAgent(
        llm=llm,
        prompts=pm,
        config=config,
        stage_dir=stage_dir,
        sandbox_factory=sandbox_factory,
    )
    print(f"\n{'='*60}")
    print(f"Test {args.test_id}: {tc['name']}")
    print(f" exec_fix_max_iterations: {args.exec_fix_iters}")
    print(f" sandbox_timeout: {args.timeout}s")
    print(f"{'='*60}")
    t0 = time.time()
    result = agent.generate(
        topic=tc["topic"],
        exp_plan=tc["exp_plan"],
        metric=tc["metric"],
        pkg_hint=(
            "\nAVAILABLE PACKAGES (docker mode): Python stdlib, numpy, "
            "torch, torchvision, sklearn, scipy, pandas, matplotlib, "
            "tqdm, timm, einops, torchmetrics, gymnasium, networkx.\n"
            "GPU: NVIDIA RTX 6000 Ada (49GB VRAM). "
            "Use `device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')` "
            "for tensor operations.\n"
            "DATA PATH: CIFAR-10 is pre-cached at /opt/datasets/cifar-10-batches-py/. "
            "Use `torchvision.datasets.CIFAR10(root='/opt/datasets', download=False)`.\n"
        ),
        max_tokens=16384,
    )
    elapsed = time.time() - t0
    # Report generation statistics.
    print(f"\n--- Generation Report ---")
    print(f"Time: {elapsed:.1f}s")
    print(f"LLM calls: {result.total_llm_calls}")
    print(f"Sandbox runs: {result.total_sandbox_runs}")
    print(f"Review rounds: {result.review_rounds}")
    print(f"Best score: {result.best_score}")
    # Write generated files to disk (supports nested paths).
    for fname, code in result.files.items():
        fpath = stage_dir / fname
        fpath.parent.mkdir(parents=True, exist_ok=True)
        fpath.write_text(code, encoding="utf-8")
        lines = len(code.split("\n"))
        print(f" {fname}: {lines} lines")
    # Write arch spec, if produced.
    if result.architecture_spec:
        (stage_dir / "architecture_spec.yaml").write_text(
            result.architecture_spec, encoding="utf-8"
        )
    # Write validation log with run metadata.
    (stage_dir / "validation_log.json").write_text(
        json.dumps({
            "log": result.validation_log,
            "total_llm_calls": result.total_llm_calls,
            "total_sandbox_runs": result.total_sandbox_runs,
            "review_rounds": result.review_rounds,
            "best_score": result.best_score,
            "elapsed_sec": round(elapsed, 1),
        }, indent=2),
        encoding="utf-8",
    )
    # Final sandbox run for end-to-end verification of the generated project.
    print(f"\n--- Final sandbox verification ---")
    workdir = stage_dir / "_final_run"
    workdir.mkdir(parents=True, exist_ok=True)
    sandbox = DockerSandbox(docker_cfg, workdir)
    final_result = sandbox.run_project(
        stage_dir, entry_point="main.py", timeout_sec=args.timeout,
    )
    print(f"Return code: {final_result.returncode}")
    print(f"Elapsed: {final_result.elapsed_sec:.1f}s")
    print(f"Timed out: {final_result.timed_out}")
    if final_result.metrics:
        print(f"Metrics: {json.dumps(dict(final_result.metrics), indent=2)}")
    if final_result.returncode != 0:
        print(f"STDERR (last 500):\n{final_result.stderr[-500:]}")
    else:
        print("SUCCESS: Code runs to completion in Docker sandbox!")
        # stdout_lines is only defined on the success path; the JSON dump
        # below guards on returncode == 0 before referencing it.
        stdout_lines = final_result.stdout.strip().split("\n")
        print(f"STDOUT (last 10 lines):")
        for line in stdout_lines[-10:]:
            print(f" {line}")
    # Save final run results.
    (stage_dir / "final_run_result.json").write_text(
        json.dumps({
            "returncode": final_result.returncode,
            "elapsed_sec": final_result.elapsed_sec,
            "timed_out": final_result.timed_out,
            "metrics": dict(final_result.metrics) if final_result.metrics else {},
            "stdout_tail": "\n".join(stdout_lines[-20:]) if final_result.returncode == 0 else "",
            "stderr_tail": final_result.stderr[-1000:] if final_result.returncode != 0 else "",
        }, indent=2),
        encoding="utf-8",
    )
# Script entry point.
if __name__ == "__main__":
    main()
================================================
FILE: scripts/test_codegen_v2.py
================================================
#!/usr/bin/env python3
"""Enhanced code generation test — generates code and runs in Docker sandbox.
Tests the full code generation pipeline in isolation:
1. Load experiment plan (from previous run or built-in test case)
2. Generate code via CodeAgent
3. Validate generated code (AST, security, quality)
4. Run in Docker sandbox
5. Score results comprehensively
Usage:
# Run with built-in test case
python scripts/test_codegen_v2.py --test-id 1
# Run with real experiment plan from a previous run
python scripts/test_codegen_v2.py --from-run output/run20
# Run all built-in test cases
python scripts/test_codegen_v2.py --test-id 0
# Skip sandbox (only test generation quality)
python scripts/test_codegen_v2.py --test-id 1 --no-sandbox
"""
from __future__ import annotations
import argparse
import ast
import json
import os
import re
import sys
import time
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from researchclaw.llm.client import LLMClient, LLMConfig
from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig
from researchclaw.prompts import PromptManager
# ---------------------------------------------------------------------------
# Built-in test cases
# ---------------------------------------------------------------------------
# Built-in test cases covering three distinct paradigms (supervised KD,
# reinforcement learning, continuous-time graph dynamics). Each entry
# supplies a topic, a YAML experiment plan passed verbatim to
# CodeAgent.generate(), the primary metric name, and its direction.
# NOTE(review): the triple-quoted plans are runtime strings; their exact
# text is what the agent receives.
TEST_CASES = {
    # Case 1: knowledge distillation, ResNet-18 teacher -> compact ViT student.
    1: {
        "name": "KD for Compact ViTs (CIFAR-10)",
        "topic": (
            "Knowledge Distillation for Compact Vision Transformers: "
            "Attention-Guided Feature Alignment on CIFAR-10"
        ),
        "exp_plan": """
topic: "Knowledge Distillation for Compact Vision Transformers"
datasets:
- name: CIFAR-10
source: torchvision.datasets.CIFAR10
path: /opt/datasets/cifar10
baselines:
- name: TeacherResNet18
description: Pre-trained ResNet-18 teacher model (frozen)
implementation_spec:
class_name: TeacherResNet18
key_methods: [__init__, forward]
algorithm_steps:
- Load pre-trained ResNet-18 from torchvision
- Freeze all parameters
- Use as teacher for distillation
- name: StudentViT_Baseline
description: Compact ViT trained with standard cross-entropy (no KD)
implementation_spec:
class_name: StudentViTBaseline
key_methods: [__init__, forward, train_epoch, evaluate]
algorithm_steps:
- Compact ViT with patch_size=4, dim=128, depth=4, heads=4
- Train with cross-entropy loss only
- Standard SGD optimizer with cosine LR schedule
loss_function: "L = CrossEntropy(student_logits, labels)"
key_hyperparameters:
lr: 0.01
batch_size: 128
epochs: 20
proposed_methods:
- name: AttentionGuidedKD
description: Knowledge distillation with attention-guided feature alignment
aligns_hypothesis: H1
implementation_spec:
class_name: AttentionGuidedKDStudent
key_methods: [__init__, forward, compute_kd_loss, compute_attention_loss, train_epoch]
algorithm_steps:
- Same compact ViT architecture as baseline
- KD loss with temperature T=4
- Attention transfer loss between teacher and student attention maps
- Combined loss = alpha * KD_loss + beta * attention_loss + (1-alpha-beta) * CE_loss
loss_function: "L = 0.5*KLDiv(s/T, t/T)*T^2 + 0.3*MSE(student_attn, teacher_attn) + 0.2*CE(s, y)"
key_hyperparameters:
temperature: 4
alpha: 0.5
beta: 0.3
lr: 0.01
differentiator: Uses attention map alignment between teacher and student
ablations:
- name: KD_NoAttentionTransfer
based_on: AttentionGuidedKD
what_is_removed: Attention transfer loss (beta=0)
how_it_differs: Only uses KD loss + CE loss, no attention alignment
expected_effect: Lower accuracy due to missing attention guidance
- name: KD_ReducedCapacity
based_on: AttentionGuidedKD
what_is_removed: Half the model capacity (dim=64, depth=2, heads=2)
how_it_differs: Smaller ViT architecture, same training procedure
expected_effect: Lower accuracy due to reduced model capacity
metrics:
primary_metric:
name: primary_metric
direction: maximize
description: Top-1 accuracy on CIFAR-10 test set
compute_budget:
total_time_seconds: 300
conditions: [TeacherResNet18, StudentViT_Baseline, AttentionGuidedKD, KD_NoAttentionTransfer, KD_ReducedCapacity]
""",
        "metric": "primary_metric",
        "metric_direction": "maximize",
    },
    # Case 2: reinforcement learning — vanilla PPO vs curiosity-driven PPO.
    2: {
        "name": "PPO with Curiosity Reward (Gymnasium)",
        "topic": (
            "Agent-Centric Reinforcement Learning with Adaptive Reward "
            "Decomposition for CartPole and LunarLander"
        ),
        "exp_plan": """
topic: "Agent-Centric RL with Adaptive Reward Decomposition"
datasets:
- name: CartPole-v1
source: gymnasium
- name: LunarLander-v3
source: gymnasium
baselines:
- name: VanillaPPO
description: Standard PPO with clipped surrogate objective
implementation_spec:
class_name: VanillaPPO
key_methods: [__init__, select_action, update, train_episode]
algorithm_steps:
- Policy network (2-layer MLP, 64 hidden)
- Value network (separate 2-layer MLP)
- Clipped surrogate objective with epsilon=0.2
- GAE lambda=0.95 for advantage estimation
loss_function: "L_policy = -min(r*A, clip(r,1-eps,1+eps)*A); L_value = MSE(V, R)"
key_hyperparameters:
lr: 3e-4
gamma: 0.99
clip_eps: 0.2
gae_lambda: 0.95
differentiator: Standard PPO baseline
proposed_methods:
- name: CuriosityPPO
description: PPO with intrinsic curiosity module
implementation_spec:
class_name: CuriosityPPO
key_methods: [__init__, select_action, compute_intrinsic_reward, update, train_episode]
algorithm_steps:
- Same PPO base as VanillaPPO
- Forward dynamics model predicts next state from (state, action)
- Intrinsic reward = prediction error of forward model
- Total reward = extrinsic + eta * intrinsic
loss_function: "L = L_ppo + L_forward_model; r_total = r_ext + eta * ||f(s,a) - s'||^2"
key_hyperparameters:
eta: 0.1
forward_model_lr: 1e-3
differentiator: Adds intrinsic curiosity-driven exploration reward
ablations:
- name: PPO_NoCuriosity
based_on: CuriosityPPO
what_is_removed: Intrinsic reward (eta=0, forward model not used)
how_it_differs: Same architecture but intrinsic reward zeroed out
expected_effect: Should match VanillaPPO performance
- name: PPO_ReducedNetwork
based_on: VanillaPPO
what_is_removed: Half network capacity (32 hidden units)
how_it_differs: Smaller policy and value networks
expected_effect: Lower performance due to limited capacity
metrics:
primary_metric:
name: primary_metric
direction: maximize
description: Average episodic reward over last 10 episodes
compute_budget:
total_time_seconds: 300
conditions: [VanillaPPO, CuriosityPPO, PPO_NoCuriosity, PPO_ReducedNetwork]
""",
        "metric": "primary_metric",
        "metric_direction": "maximize",
    },
    # Case 3: graph dynamics — discrete-step GCN vs continuous Graph Neural ODE.
    # Note: the only case whose primary metric is minimized.
    3: {
        "name": "Graph Neural ODE (Synthetic)",
        "topic": (
            "Graph Neural Ordinary Differential Equations for Dynamic System "
            "Modeling on Synthetic Coupled Oscillator Networks"
        ),
        "exp_plan": """
topic: "Graph Neural ODE for Dynamic System Modeling"
datasets:
- name: SyntheticOscillators
source: Generated in-code
description: Coupled spring-mass system on a random graph
baselines:
- name: StaticGCN
description: Standard GCN applied at discrete time steps
implementation_spec:
class_name: StaticGCN
key_methods: [__init__, forward, predict_trajectory]
algorithm_steps:
- 2-layer GCN with message passing
- Discrete time step predictions
- MSE loss on next-step prediction
loss_function: "L = MSE(pred_next, true_next)"
key_hyperparameters:
hidden_dim: 64
num_layers: 2
lr: 1e-3
proposed_methods:
- name: GraphNeuralODE
description: Continuous-time dynamics via Neural ODE on graph
implementation_spec:
class_name: GraphNeuralODE
key_methods: [__init__, forward, ode_func, predict_trajectory]
algorithm_steps:
- GNN-based ODE function f(t, x, A) that defines dx/dt
- Neural ODE solver (torchdiffeq.odeint) for continuous trajectory
- MSE loss on trajectory prediction at observed time points
loss_function: "L = MSE(odeint(f, x0, t), x_true)"
key_hyperparameters:
hidden_dim: 64
solver: dopri5
lr: 1e-3
differentiator: Continuous-time dynamics via ODE solver
ablations:
- name: GraphODE_NoMessagePassing
based_on: GraphNeuralODE
what_is_removed: Graph structure (treats nodes independently)
how_it_differs: ODE function ignores adjacency, no message passing
expected_effect: Worse prediction on coupled systems
- name: GraphODE_EulerSolver
based_on: GraphNeuralODE
what_is_removed: Adaptive ODE solver (uses fixed-step Euler)
how_it_differs: Simple Euler integration instead of dopri5
expected_effect: Less accurate trajectories
metrics:
primary_metric:
name: primary_metric
direction: minimize
description: MSE between predicted and true trajectories
compute_budget:
total_time_seconds: 300
conditions: [StaticGCN, GraphNeuralODE, GraphODE_NoMessagePassing, GraphODE_EulerSolver]
""",
        "metric": "primary_metric",
        "metric_direction": "minimize",
    },
}
# ---------------------------------------------------------------------------
# Code quality analysis (comprehensive)
# ---------------------------------------------------------------------------
def analyze_code_quality(files: dict[str, str], test_case: dict) -> dict:
    """Comprehensive static quality analysis of generated Python sources.

    Parses each file with :mod:`ast` (no code is executed), gathers raw
    statistics (line counts, classes with methods/bases/stub detection,
    top-level functions, imports), then scores seven dimensions on a 0-10
    scale and averages them into ``overall_score``.

    Parameters
    ----------
    files:
        Mapping of file name -> source text.
    test_case:
        Test-case spec; only ``"name"`` is read here.

    Returns
    -------
    dict
        Report with raw statistics, ``scores``, ``overall_score`` and a
        list of human-readable ``issues``.
    """
    report = {
        "test_name": test_case["name"],
        "num_files": len(files),
        "file_names": list(files.keys()),
        "total_lines": 0,
        "effective_lines": 0,
        "classes_found": [],
        "functions_found": [],
        "imports_found": [],
        "issues": [],
        "scores": {},
    }
    for fname, code in files.items():
        lines = code.split("\n")
        report["total_lines"] += len(lines)
        # "Effective" lines exclude blanks, comments and docstring delimiters
        # (unlike the quick-test analyzer, import lines DO count here).
        effective = [
            l for l in lines
            if l.strip()
            and not l.strip().startswith("#")
            and not l.strip().startswith('"""')
            and not l.strip().startswith("'''")
        ]
        report["effective_lines"] += len(effective)
        # AST analysis
        try:
            tree = ast.parse(code)
            for node in ast.walk(tree):
                if isinstance(node, ast.ClassDef):
                    methods = [
                        n.name for n in node.body
                        if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
                    ]
                    method_lines = sum(
                        (n.end_lineno or n.lineno) - n.lineno + 1
                        for n in node.body
                        if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
                    )
                    # Flag stub methods: a body of only `pass` and/or
                    # docstrings leaves at most one "real" statement.
                    # FIX: test against ast.Constant only -- ast.Str was
                    # removed in Python 3.12 (string constants have been
                    # ast.Constant since 3.8), and scan async methods too,
                    # matching the `methods` list above.
                    empty_methods = []
                    for n in node.body:
                        if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)):
                            body_stmts = [
                                s for s in n.body
                                if not isinstance(s, (ast.Pass, ast.Expr))
                                or (isinstance(s, ast.Expr)
                                    and not isinstance(s.value, ast.Constant))
                            ]
                            if len(body_stmts) <= 1:
                                empty_methods.append(n.name)
                    report["classes_found"].append({
                        "name": node.name,
                        "file": fname,
                        "methods": methods,
                        "method_count": len(methods),
                        "total_method_lines": method_lines,
                        "bases": [ast.unparse(b) for b in node.bases],
                        "empty_methods": empty_methods,
                    })
                elif isinstance(node, ast.FunctionDef) and node.col_offset == 0:
                    # Only module-level (column 0) functions are recorded.
                    report["functions_found"].append({
                        "name": node.name,
                        "file": fname,
                        "lines": (node.end_lineno or node.lineno) - node.lineno + 1,
                    })
                elif isinstance(node, (ast.Import, ast.ImportFrom)):
                    if isinstance(node, ast.Import):
                        for alias in node.names:
                            report["imports_found"].append(alias.name.split(".")[0])
                    elif node.module:
                        report["imports_found"].append(node.module.split(".")[0])
        except SyntaxError as e:
            report["issues"].append(f"CRITICAL: SyntaxError in {fname}: {e}")
    report["imports_found"] = sorted(set(report["imports_found"]))
    # ---- Scoring (each dimension clamped to 0-10) ----
    # 1. Syntax validity (0 or 10)
    syntax_ok = not any("SyntaxError" in i for i in report["issues"])
    report["scores"]["syntax_valid"] = 10 if syntax_ok else 0
    # 2. File structure
    file_score = min(10, len(files) * 5)  # 2+ files = 10
    report["scores"]["file_structure"] = round(file_score, 1)
    # 3. Class coverage
    n_classes = len(report["classes_found"])
    class_score = min(10, n_classes * 2.5)  # 4+ classes = 10
    report["scores"]["class_coverage"] = round(class_score, 1)
    # 4. Code depth
    depth_score = min(10, report["effective_lines"] / 40)  # 400+ = 10
    report["scores"]["code_depth"] = round(depth_score, 1)
    # 5. Method richness
    if report["classes_found"]:
        avg_methods = sum(c["method_count"] for c in report["classes_found"]) / n_classes
        method_score = min(10, avg_methods * 2)  # 5+ methods = 10
    else:
        method_score = 0
    report["scores"]["method_richness"] = round(method_score, 1)
    # 6. Class distinctness (penalize near-empty or duplicated classes)
    empty_class_count = sum(
        1 for c in report["classes_found"]
        if c["total_method_lines"] < 5
    )
    identical_pairs = _check_identical_classes(files)
    distinctness = 10
    if empty_class_count > 0:
        distinctness -= empty_class_count * 3
        report["issues"].append(
            f"WARNING: {empty_class_count} classes have <5 method lines (too thin)"
        )
    if identical_pairs:
        distinctness -= len(identical_pairs) * 4
        for p in identical_pairs:
            report["issues"].append(f"WARNING: Identical classes: {p}")
    report["scores"]["class_distinctness"] = max(0, round(distinctness, 1))
    # 7. Import appropriateness
    has_torch = "torch" in report["imports_found"]
    has_numpy = "numpy" in report["imports_found"]
    import_score = 5  # base
    if has_torch:
        import_score += 3
    if has_numpy:
        import_score += 2
    report["scores"]["imports"] = min(10, import_score)
    # Overall score: unweighted mean of the seven dimensions.
    scores = report["scores"]
    report["overall_score"] = round(sum(scores.values()) / len(scores), 1)
    return report
def _check_identical_classes(files: dict[str, str]) -> list[str]:
"""Check for classes with identical method bodies."""
identical = []
class_bodies: dict[str, str] = {}
for fname, code in files.items():
try:
tree = ast.parse(code)
except SyntaxError:
continue
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
# Hash the method bodies
method_code = ""
for n in node.body:
if isinstance(n, ast.FunctionDef):
try:
method_code += ast.unparse(n) + "\n"
except Exception:
pass
if method_code:
key = hash(method_code)
if key in class_bodies:
identical.append(
f"{class_bodies[key]} == {node.name}"
)
else:
class_bodies[key] = node.name
return identical
# ---------------------------------------------------------------------------
# Sandbox execution
# ---------------------------------------------------------------------------
def run_in_sandbox(
files: dict[str, str],
output_dir: Path,
config_path: str | None = None,
timeout_sec: int = 300,
) -> dict:
"""Run generated code in subprocess (or Docker sandbox if available)."""
# Write files
code_dir = output_dir / "experiment"
code_dir.mkdir(parents=True, exist_ok=True)
for fname, code in files.items():
(code_dir / fname).write_text(code, encoding="utf-8")
# Try to run with subprocess as fallback
import subprocess
main_py = code_dir / "main.py"
if not main_py.exists():
return {"status": "failed", "reason": "no main.py"}
print(f" Running in subprocess (timeout={timeout_sec}s)...")
try:
proc = subprocess.run(
[sys.executable, str(main_py)],
cwd=str(code_dir),
capture_output=True,
text=True,
timeout=timeout_sec,
env={**os.environ, "PYTHONPATH": str(code_dir)},
)
stdout = proc.stdout
stderr = proc.stderr
returncode = proc.returncode
timed_out = False
except subprocess.TimeoutExpired:
stdout = ""
stderr = "TIMEOUT"
returncode = -1
timed_out = True
# Parse results
result = {
"status": "success" if returncode == 0 else "failed",
"returncode": returncode,
"timed_out": timed_out,
"stdout_lines": len(stdout.split("\n")) if stdout else 0,
"stderr_lines": len(stderr.split("\n")) if stderr else 0,
"conditions_found": [],
"metrics_found": {},
"has_metric_def": False,
"has_registered_conditions": False,
}
# Parse stdout for conditions and metrics
if stdout:
for line in stdout.split("\n"):
if line.startswith("METRIC_DEF:"):
result["has_metric_def"] = True
elif line.startswith("REGISTERED_CONDITIONS:"):
result["has_registered_conditions"] = True
conds = line.split(":", 1)[1].strip()
result["conditions_found"] = [c.strip() for c in conds.split(",")]
elif "condition=" in line:
m = re.match(r"condition=(\S+)\s+(\S+):\s+(\S+)", line)
if m:
cond, metric_name, value = m.groups()
if cond not in result["metrics_found"]:
result["metrics_found"][cond] = {}
try:
result["metrics_found"][cond][metric_name] = float(value)
except ValueError:
pass
# Score execution
exec_score = 0
if returncode == 0:
exec_score += 3 # runs
if result["has_metric_def"]:
exec_score += 1
if result["has_registered_conditions"]:
exec_score += 1
if result["conditions_found"]:
exec_score += min(3, len(result["conditions_found"])) # up to 3 for conditions
if result["metrics_found"]:
exec_score += 2 # produces metrics
result["exec_score"] = min(10, exec_score)
# Save stdout/stderr
(output_dir / "stdout.txt").write_text(stdout or "(empty)", encoding="utf-8")
(output_dir / "stderr.txt").write_text(stderr or "(empty)", encoding="utf-8")
return result
# ---------------------------------------------------------------------------
# Load experiment plan from previous run
# ---------------------------------------------------------------------------
def load_from_run(run_dir: str) -> dict:
    """Load experiment plan and config from a previous pipeline run."""
    root = Path(run_dir)
    if not root.exists():
        print(f"ERROR: Run directory not found: {run_dir}")
        sys.exit(1)

    # Locate exp_plan.yaml in the newest stage-09* directory that has one.
    plan_file = next(
        (
            d / "exp_plan.yaml"
            for d in sorted(root.glob("stage-09*"), reverse=True)
            if (d / "exp_plan.yaml").exists()
        ),
        None,
    )
    if plan_file is None:
        print(f"ERROR: No exp_plan.yaml found in {run_dir}/stage-09*/")
        sys.exit(1)
    plan_text = plan_file.read_text(encoding="utf-8")

    # Recover the research topic from stage-01/02 artifacts, newest first.
    topic = ""
    for candidate_name in ("topic_evaluation.json", "topic.json"):
        for stage_dir in sorted(root.glob("stage-0[12]*"), reverse=True):
            artifact = stage_dir / candidate_name
            if not artifact.exists():
                continue
            try:
                payload = json.loads(artifact.read_text(encoding="utf-8"))
                topic = payload.get("topic", "") or payload.get("research_topic", "")
                if topic:
                    break
            except Exception:
                continue
        if topic:
            break

    # Last resort: the plan itself may carry a topic field.
    if not topic:
        import yaml
        try:
            topic = yaml.safe_load(plan_text).get("topic", "Unknown Topic")
        except Exception:
            topic = "Unknown Topic"

    return {
        "name": f"From {root.name}",
        "topic": topic,
        "exp_plan": plan_text,
        "metric": "primary_metric",
        "metric_direction": "maximize",
    }
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """CLI entry point.

    For each selected test case: configure a CodeAgent, generate code via
    the LLM, analyze its quality, optionally execute it in a subprocess
    sandbox, and persist a per-test quality report plus an overall summary.
    """
    parser = argparse.ArgumentParser(
        description="Test code generation quality with optional sandbox execution"
    )
    parser.add_argument("--model", default="gpt-5.1", help="Model to use")
    parser.add_argument("--test-id", type=int, default=0, help="Test case ID (0=all)")
    parser.add_argument("--from-run", default="", help="Load exp plan from run dir")
    parser.add_argument("--no-sandbox", action="store_true", help="Skip sandbox execution")
    parser.add_argument("--sandbox-timeout", type=int, default=300, help="Sandbox timeout (sec)")
    parser.add_argument("--output-dir", default="test_outputs_codegen", help="Output dir")
    parser.add_argument("--config", default="config_run20.yaml", help="Config file for LLM")
    args = parser.parse_args()
    # Setup LLM client
    # Try loading from config file first
    config_path = Path(args.config)
    if config_path.exists():
        import yaml
        with open(config_path) as f:
            cfg = yaml.safe_load(f)
        llm_cfg = cfg.get("llm", {})
        base_url = llm_cfg.get("base_url", "")
        # Inline key in the config wins; otherwise read the configured env var.
        api_key = llm_cfg.get("api_key", "") or os.environ.get(
            llm_cfg.get("api_key_env", "OPENAI_API_KEY"), ""
        )
    else:
        base_url = os.environ.get("OPENAI_BASE_URL", "")
        api_key = os.environ.get("OPENAI_API_KEY", "")
    if not base_url or not api_key:
        print("ERROR: Need LLM config. Provide --config or set env vars.")
        sys.exit(1)
    llm_config = LLMConfig(
        base_url=base_url,
        api_key=api_key,
        primary_model=args.model,
        fallback_models=["gpt-4.1", "gpt-4o"],
        max_tokens=16384,
        temperature=0.7,
        timeout_sec=300,
    )
    llm = LLMClient(llm_config)
    # Connectivity test
    print(f"Testing LLM ({args.model})...", end=" ", flush=True)
    ok, msg = llm.preflight()
    if not ok:
        print(f"FAILED: {msg}")
        sys.exit(1)
    print("OK")
    pm = PromptManager()
    # Select test cases: a prior run's plan, one case by id, or all cases.
    if args.from_run:
        cases = {99: load_from_run(args.from_run)}
    elif args.test_id > 0:
        if args.test_id not in TEST_CASES:
            print(f"ERROR: Unknown test ID {args.test_id}. Available: {list(TEST_CASES.keys())}")
            sys.exit(1)
        cases = {args.test_id: TEST_CASES[args.test_id]}
    else:
        cases = dict(TEST_CASES)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    all_reports = []
    for test_id, tc in cases.items():
        print(f"\n{'='*70}")
        print(f" Test {test_id}: {tc['name']}")
        print(f"{'='*70}")
        stage_dir = output_dir / f"test_{test_id}"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Configure CodeAgent
        agent_config = CodeAgentConfig(
            architecture_planning=True,
            exec_fix_max_iterations=0,  # no sandbox in generation phase
            tree_search_enabled=False,
            review_max_rounds=2,
        )
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=agent_config,
            stage_dir=stage_dir,
        )
        # Build pkg_hint
        pkg_hint = (
            "\nAVAILABLE PACKAGES (docker mode): Python stdlib, numpy, torch, "
            "torchvision, torchaudio, matplotlib, seaborn, scipy, tqdm, "
            "torchdiffeq, gymnasium, networkx, PyYAML, Pillow, transformers, "
            "datasets, accelerate, peft, timm, einops, torchmetrics.\n"
            "GPU: NVIDIA RTX 6000 Ada (49GB VRAM). "
            "Use `device = torch.device('cuda')` for tensor operations.\n"
        )
        metric_dir = tc.get("metric_direction", "maximize")
        pkg_hint += f"\nMETRIC DIRECTION: {metric_dir}\n"
        # Add compute budget
        pkg_hint += (
            "\n## Compute Budget Constraint\n"
            "- Total execution time limit: 300 seconds\n"
            "- Design experiments that complete within this budget\n"
            "- Implement a time guard: stop gracefully at 80% of budget\n"
        )
        # Generate
        t0 = time.time()
        result = agent.generate(
            topic=tc["topic"],
            exp_plan=tc["exp_plan"],
            metric=tc.get("metric", "primary_metric"),
            pkg_hint=pkg_hint,
            max_tokens=16384,
        )
        gen_elapsed = time.time() - t0
        print(f"\n Generation: {gen_elapsed:.1f}s, {result.total_llm_calls} LLM calls")
        print(f" Architecture spec: {len(result.architecture_spec)} chars")
        print(f" Review rounds: {result.review_rounds}")
        # Write files
        for fname, code in result.files.items():
            fpath = stage_dir / fname
            fpath.parent.mkdir(parents=True, exist_ok=True)
            fpath.write_text(code, encoding="utf-8")
            print(f" -> {fname}: {len(code.split(chr(10)))} lines")
        if result.architecture_spec:
            (stage_dir / "architecture_spec.yaml").write_text(
                result.architecture_spec, encoding="utf-8"
            )
        # Quality analysis
        report = analyze_code_quality(result.files, tc)
        report["generation_time_sec"] = round(gen_elapsed, 1)
        report["llm_calls"] = result.total_llm_calls
        # Sandbox execution (default "skipped" status survives --no-sandbox).
        exec_result = {"status": "skipped"}
        if not args.no_sandbox and result.files:
            exec_result = run_in_sandbox(
                result.files, stage_dir,
                timeout_sec=args.sandbox_timeout,
            )
        report["execution"] = exec_result
        print(f"\n Execution: {exec_result['status']}")
        if exec_result.get("returncode") is not None:
            print(f" Return code: {exec_result['returncode']}")
        if exec_result.get("conditions_found"):
            print(f" Conditions: {', '.join(exec_result['conditions_found'])}")
        if exec_result.get("metrics_found"):
            for cond, metrics in exec_result["metrics_found"].items():
                print(f" {cond}: {metrics}")
        # Print scores
        print(f"\n --- Scores ---")
        for k, v in report["scores"].items():
            print(f" {k}: {v}/10")
        if exec_result.get("exec_score") is not None:
            print(f" execution: {exec_result['exec_score']}/10")
        print(f" OVERALL: {report['overall_score']}/10")
        if report["issues"]:
            print(f"\n Issues:")
            for issue in report["issues"]:
                print(f" - {issue}")
        # Save report
        (stage_dir / "quality_report.json").write_text(
            json.dumps(report, indent=2, default=str), encoding="utf-8"
        )
        all_reports.append(report)
    # Summary (cross-test table only makes sense for more than one case)
    if len(all_reports) > 1:
        print(f"\n{'='*70}")
        print(" SUMMARY")
        print(f"{'='*70}")
        for r in all_reports:
            exec_info = ""
            if "execution" in r:
                exec_info = f" | exec: {r['execution'].get('status', '?')}"
            print(
                f" {r['test_name']}: {r['overall_score']}/10 "
                f"({r['effective_lines']} lines, "
                f"{len(r['classes_found'])} classes{exec_info})"
            )
        avg = sum(r["overall_score"] for r in all_reports) / len(all_reports)
        print(f"\n Average: {avg:.1f}/10")
    (output_dir / "summary.json").write_text(
        json.dumps(all_reports, indent=2, default=str), encoding="utf-8"
    )
    print(f"\nAll outputs saved to: {output_dir}/")


if __name__ == "__main__":
    main()
================================================
FILE: sentinel.sh
================================================
#!/usr/bin/env bash
# sentinel.sh — Watchdog for AutoResearchClaw pipeline process.
#
# Monitors the pipeline heartbeat file and auto-restarts on crash.
# Inspired by Sibyl's sentinel watchdog design.
#
# Usage:
#   ./sentinel.sh <run_dir> [--python <path>]
#
# The pipeline runner writes heartbeat.json after each stage. If the
# heartbeat goes stale (>5 min) and the PID is dead, sentinel restarts.
#
# Configuration via environment:
# SENTINEL_CHECK_INTERVAL — seconds between checks (default: 60)
# SENTINEL_STALE_THRESHOLD — seconds before heartbeat is stale (default: 300)
# SENTINEL_MAX_RETRIES — max restart attempts (default: 5)
# SENTINEL_COOLDOWN — seconds to wait after 3 consecutive failures (default: 360)
set -euo pipefail

# --- Arguments ---
# First positional argument is the run directory; abort with usage if absent.
RUN_DIR="${1:?Usage: sentinel.sh <run_dir> [--python <path>]}"
PYTHON_PATH="python"
shift
while [[ $# -gt 0 ]]; do
    case "$1" in
        --python)
            # Fail with a clear message instead of an opaque set -u
            # "unbound variable" error when the value is missing.
            PYTHON_PATH="${2:?--python requires a value}"
            shift 2
            ;;
        *)
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done

# --- Configuration (overridable via environment) ---
CHECK_INTERVAL="${SENTINEL_CHECK_INTERVAL:-60}"      # seconds between checks
STALE_THRESHOLD="${SENTINEL_STALE_THRESHOLD:-300}"   # heartbeat staleness cutoff
MAX_RETRIES="${SENTINEL_MAX_RETRIES:-5}"             # restart attempts before giving up
COOLDOWN="${SENTINEL_COOLDOWN:-360}"                 # pause after repeated failures

HEARTBEAT_FILE="${RUN_DIR}/heartbeat.json"
RECOVERY_LOG="${RUN_DIR}/sentinel_recovery.log"
FAILED_LOG="${RUN_DIR}/sentinel_failed.log"

retry_count=0
consecutive_failures=0
log() {
    # Timestamped message to stdout and appended to the recovery log.
    local stamp line
    stamp="$(date '+%Y-%m-%dT%H:%M:%S')"
    line="[sentinel ${stamp}] $1"
    printf '%s\n' "$line"
    printf '%s\n' "$line" >> "$RECOVERY_LOG"
}
# --- Check if heartbeat is stale ---
# Returns 0 (stale) when heartbeat.json is missing, unparseable, or older
# than STALE_THRESHOLD seconds; returns 1 when the heartbeat is fresh.
is_stale() {
    if [[ ! -f "$HEARTBEAT_FILE" ]]; then
        return 0 # No heartbeat = stale
    fi
    local now
    now=$(date +%s)
    # Extract timestamp from heartbeat.json as epoch seconds; any parse
    # failure yields 0, which makes the age check report "stale".
    local hb_ts
    hb_ts=$(python3 -c "
import json, sys
try:
    data = json.load(open('${HEARTBEAT_FILE}'))
    from datetime import datetime
    ts = datetime.fromisoformat(data['timestamp'])
    print(int(ts.timestamp()))
except Exception:
    print(0)
" 2>/dev/null || echo 0)
    local age=$(( now - hb_ts ))
    [[ $age -gt $STALE_THRESHOLD ]]
}
# --- Check if PID is alive ---
pid_alive() {
    # True when pipeline.pid exists, is non-empty, and names a live process.
    local pid_file="${RUN_DIR}/pipeline.pid"
    [[ -f "$pid_file" ]] || return 1
    local pid
    pid=$(cat "$pid_file" 2>/dev/null || echo "")
    [[ -n "$pid" ]] || return 1
    kill -0 "$pid" 2>/dev/null
}
# --- Check for active subprocesses ---
has_active_children() {
    # True when the recorded pipeline PID still has running child processes.
    local pid_file="${RUN_DIR}/pipeline.pid"
    [[ -f "$pid_file" ]] || return 1
    local pid
    pid=$(cat "$pid_file" 2>/dev/null || echo "")
    [[ -n "$pid" ]] || return 1
    pgrep -P "$pid" > /dev/null 2>&1
}
# --- Restart pipeline ---
# Relaunches the pipeline in resume mode as a background job, records the
# new PID in pipeline.pid, and bumps the global retry_count.
restart_pipeline() {
    log "Attempting pipeline restart (attempt $((retry_count + 1))/${MAX_RETRIES})"
    # Background the run so the sentinel loop keeps monitoring it.
    $PYTHON_PATH -m researchclaw run --resume --output "$RUN_DIR" &
    local new_pid=$!
    echo "$new_pid" > "${RUN_DIR}/pipeline.pid"
    log "Pipeline restarted with PID ${new_pid}"
    retry_count=$((retry_count + 1))
}
# --- Main loop ---
# Poll every CHECK_INTERVAL seconds; restart the pipeline only when the PID
# is dead AND the heartbeat is stale AND no child processes remain.
log "Sentinel started for ${RUN_DIR}"
log "Check interval: ${CHECK_INTERVAL}s, Stale threshold: ${STALE_THRESHOLD}s"
log "Max retries: ${MAX_RETRIES}, Cooldown: ${COOLDOWN}s"
while true; do
    sleep "$CHECK_INTERVAL"
    # If PID is alive, reset failure counter
    if pid_alive; then
        consecutive_failures=0
        continue
    fi
    # PID is dead — check if heartbeat is stale
    if ! is_stale; then
        # Heartbeat is fresh but PID is gone — might have just exited normally
        continue
    fi
    # Don't interrupt active subprocesses
    if has_active_children; then
        log "Active subprocesses detected — skipping restart"
        continue
    fi
    # Check retry limit
    if [[ $retry_count -ge $MAX_RETRIES ]]; then
        log "Max retries (${MAX_RETRIES}) reached — sentinel giving up"
        echo "Sentinel failed after ${MAX_RETRIES} retries at $(date)" >> "$FAILED_LOG"
        exit 1
    fi
    # Cooldown after consecutive failures
    consecutive_failures=$((consecutive_failures + 1))
    if [[ $consecutive_failures -ge 3 ]]; then
        log "3 consecutive failures — cooling down for ${COOLDOWN}s"
        sleep "$COOLDOWN"
        consecutive_failures=0
    fi
    restart_pipeline
done
================================================
FILE: tests/__init__.py
================================================
================================================
FILE: tests/conftest.py
================================================
# conftest.py — shared pytest fixtures for researchclaw tests
================================================
FILE: tests/e2e_docker_sandbox.py
================================================
#!/usr/bin/env python3
"""End-to-end verification for Docker sandbox.
Run after building the image:
docker build -t researchclaw/experiment:latest researchclaw/docker/
python tests/e2e_docker_sandbox.py
"""
from __future__ import annotations
import json
import sys
import tempfile
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from researchclaw.config import DockerSandboxConfig, ExperimentConfig
from researchclaw.experiment.docker_sandbox import DockerSandbox
from researchclaw.experiment.factory import create_sandbox
# ANSI-colored status tags for terminal output.
PASS = "\033[92mPASS\033[0m"
FAIL = "\033[91mFAIL\033[0m"
SKIP = "\033[93mSKIP\033[0m"
# (name, ok, detail) tuples accumulated by check() for the final summary.
results: list[tuple[str, bool, str]] = []
def check(name: str, ok: bool, detail: str = "") -> None:
    """Record one verification outcome and echo a PASS/FAIL line.

    Appends ``(name, ok, detail)`` to the module-level ``results`` list
    so the final summary can list failures.
    """
    results.append((name, ok, detail))
    status = PASS if ok else FAIL
    line = f" [{status}] {name}"
    if detail:
        line = line + f" — {detail}"
    print(line)
def main() -> None:
    """Run the Docker-sandbox end-to-end checks; exits 1 if any check fails."""
    print("=" * 60)
    print("Docker Sandbox End-to-End Verification")
    print("=" * 60)
    # ── Preflight ──────────────────────────────────────────────
    print("\n--- Preflight ---")
    docker_ok = DockerSandbox.check_docker_available()
    check("Docker daemon reachable", docker_ok)
    if not docker_ok:
        print("\nDocker is not available. Cannot proceed.")
        sys.exit(1)
    image_ok = DockerSandbox.ensure_image("researchclaw/experiment:latest")
    check("Image exists locally", image_ok)
    if not image_ok:
        print("\nImage not found. Build it first:")
        print(" docker build -t researchclaw/experiment:latest researchclaw/docker/")
        sys.exit(1)
    # ── Test 1: Basic execution + metrics ──────────────────────
    print("\n--- Test 1: Basic execution + metrics ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none")
        sandbox = DockerSandbox(cfg, Path(tmp) / "work")
        code = (
            "import numpy as np\n"
            "x = np.random.randn(100)\n"
            "print(f'primary_metric: {float(np.mean(x**2)):.4f}')\n"
            "print(f'std: {float(np.std(x)):.4f}')\n"
            "print('Done.')\n"
        )
        r = sandbox.run(code, timeout_sec=60)
        check("returncode == 0", r.returncode == 0, f"rc={r.returncode}")
        check("metrics parsed", "primary_metric" in r.metrics, str(r.metrics))
        check("stdout non-empty", bool(r.stdout.strip()), repr(r.stdout[:100]))
        check("timed_out is False", r.timed_out is False)
        check("elapsed_sec > 0", r.elapsed_sec > 0, f"{r.elapsed_sec:.2f}s")
    # ── Test 2: Multi-file project ─────────────────────────────
    print("\n--- Test 2: Multi-file project ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none")
        sandbox = DockerSandbox(cfg, Path(tmp) / "work")
        project = Path(tmp) / "project"
        project.mkdir()
        (project / "utils.py").write_text(
            "def add(a, b): return a + b\n", encoding="utf-8"
        )
        (project / "main.py").write_text(
            "from utils import add\n"
            "result = add(3, 4)\n"
            "print(f'primary_metric: {result}')\n",
            encoding="utf-8",
        )
        r = sandbox.run_project(project, timeout_sec=60)
        check("project returncode == 0", r.returncode == 0, f"rc={r.returncode}")
        check("project metric correct", r.metrics.get("primary_metric") == 7.0,
              str(r.metrics))
    # ── Test 3: results.json ───────────────────────────────────
    print("\n--- Test 3: results.json from volume ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none")
        sandbox = DockerSandbox(cfg, Path(tmp) / "work")
        code = (
            "import json\n"
            "results = {'accuracy': 0.92, 'f1': 0.88}\n"
            "with open('results.json', 'w') as f:\n"
            " json.dump(results, f)\n"
            "print('primary_metric: 0.92')\n"
        )
        r = sandbox.run(code, timeout_sec=60)
        check("results.json metric merged", "f1" in r.metrics,
              str(r.metrics))
    # ── Test 4: Network isolation ──────────────────────────────
    print("\n--- Test 4: Network isolation ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none")
        sandbox = DockerSandbox(cfg, Path(tmp) / "work")
        code = (
            "import urllib.request\n"
            "try:\n"
            " urllib.request.urlopen('http://example.com', timeout=5)\n"
            " print('NETWORK_ACCESS: yes')\n"
            "except Exception as e:\n"
            " print('NETWORK_ACCESS: no')\n"
            " print(f'primary_metric: 1.0')\n"
        )
        r = sandbox.run(code, timeout_sec=30)
        network_blocked = "NETWORK_ACCESS: no" in r.stdout
        check("Network blocked (--network=none)", network_blocked,
              r.stdout.strip()[:200])
    # ── Test 5: GPU visibility ─────────────────────────────────
    print("\n--- Test 5: GPU visibility ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        cfg = DockerSandboxConfig(gpu_enabled=True, network_policy="none")
        sandbox = DockerSandbox(cfg, Path(tmp) / "work")
        code = (
            "import torch\n"
            "gpu_available = torch.cuda.is_available()\n"
            "if gpu_available:\n"
            " print(f'GPU: {torch.cuda.get_device_name(0)}')\n"
            " print('primary_metric: 1.0')\n"
            "else:\n"
            " print('GPU: none')\n"
            " print('primary_metric: 0.0')\n"
        )
        r = sandbox.run(code, timeout_sec=60)
        gpu_visible = "primary_metric" in r.metrics and r.metrics["primary_metric"] == 1.0
        if gpu_visible:
            check("GPU visible in container", True, r.stdout.strip()[:200])
        else:
            # Not a hard failure — might not have NVIDIA runtime
            print(f" [{SKIP}] GPU not visible (NVIDIA Container Toolkit may not be installed)")
            print(f" stdout: {r.stdout.strip()[:200]}")
            print(f" stderr: {r.stderr.strip()[:200]}")
    # ── Test 6: Memory limit ──────────────────────────────────
    print("\n--- Test 6: Memory limit enforcement ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        # Set a very low memory limit to trigger OOM
        cfg = DockerSandboxConfig(
            gpu_enabled=False, network_policy="none", memory_limit_mb=64
        )
        sandbox = DockerSandbox(cfg, Path(tmp) / "work")
        code = (
            "import numpy as np\n"
            "# Allocate ~200MB to exceed 64MB limit\n"
            "x = np.ones((25_000_000,), dtype=np.float64)\n"
            "print(f'primary_metric: {x.sum()}')\n"
        )
        r = sandbox.run(code, timeout_sec=30)
        oom = r.returncode != 0
        check("OOM kills container (64MB limit, 200MB alloc)", oom,
              f"rc={r.returncode}, stderr={r.stderr.strip()[:200]}")
    # ── Test 7: Factory integration ────────────────────────────
    print("\n--- Test 7: Factory integration ---")
    with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp:
        config = ExperimentConfig(mode="docker", docker=DockerSandboxConfig(gpu_enabled=False))
        sandbox = create_sandbox(config, Path(tmp) / "work")
        check("Factory returns DockerSandbox", isinstance(sandbox, DockerSandbox))
        r = sandbox.run("print('primary_metric: 42.0')", timeout_sec=30)
        check("Factory sandbox executes", r.returncode == 0 and r.metrics.get("primary_metric") == 42.0,
              str(r.metrics))
    # ── Summary ────────────────────────────────────────────────
    print("\n" + "=" * 60)
    passed = sum(1 for _, ok, _ in results if ok)
    failed = sum(1 for _, ok, _ in results if not ok)
    print(f"Results: {passed} passed, {failed} failed")
    if failed:
        print("\nFailed tests:")
        for name, ok, detail in results:
            if not ok:
                print(f" - {name}: {detail}")
        sys.exit(1)
    else:
        print("All tests passed!")


if __name__ == "__main__":
    main()
================================================
FILE: tests/e2e_real_llm.py
================================================
#!/usr/bin/env python3
"""Real E2E test: run all 22 stages with actual LLM API calls.
Usage:
.venv_arc/bin/python3 tests/e2e_real_llm.py
"""
from __future__ import annotations
import json
import sys
import time
from pathlib import Path
import yaml
# Ensure project root is on path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from researchclaw.config import RCConfig
from researchclaw.adapters import AdapterBundle
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline.stages import Stage, STAGE_SEQUENCE
from researchclaw.pipeline.executor import execute_stage, StageResult
from researchclaw.pipeline.runner import execute_pipeline
def main() -> None:
    """Run all 22 pipeline stages against the real LLM API and report results.

    Exits 0 only when every stage completes; otherwise exits 1.
    """
    # --- Load config ---
    config_path = Path("config.arc.yaml")
    if not config_path.exists():
        print("ERROR: config.arc.yaml not found")
        sys.exit(1)
    with open(config_path) as f:
        raw = yaml.safe_load(f)
    # Override for test
    raw["research"]["topic"] = (
        "Efficient Attention Mechanisms for Long-Context Language Models"
    )
    raw["experiment"]["mode"] = "sandbox"
    raw["experiment"]["time_budget_sec"] = 60
    raw["experiment"]["max_iterations"] = 3
    config = RCConfig.from_dict(raw, check_paths=False)
    adapters = AdapterBundle()
    # --- Create run directory ---
    run_dir = Path("artifacts/e2e-real-llm-run")
    run_dir.mkdir(parents=True, exist_ok=True)
    run_id = f"e2e-real-{int(time.time())}"
    print(f"=" * 70)
    print(f"ResearchClaw E2E Test — Real LLM API")
    print(f"Topic: {config.research.topic}")
    print(f"Run ID: {run_id}")
    print(f"Output: {run_dir}")
    print(f"=" * 70)
    # --- Run full pipeline ---
    start = time.time()
    results = execute_pipeline(
        run_dir=run_dir,
        run_id=run_id,
        config=config,
        adapters=adapters,
        auto_approve_gates=True,  # Auto-approve all gates for E2E test
        kb_root=run_dir / "kb",
    )
    total_time = time.time() - start
    # --- Report ---
    print(f"\n{'=' * 70}")
    print(f"RESULTS: {len(results)}/22 stages executed in {total_time:.1f}s")
    print(f"{'=' * 70}")
    passed = 0
    failed = 0
    for r in results:
        status_icon = "✅" if r.status.value == "done" else "❌"
        print(
            f" {status_icon} Stage {int(r.stage):02d} {r.stage.name}: {r.status.value} | artifacts: {r.artifacts}"
        )
        if r.status.value == "done":
            passed += 1
        else:
            failed += 1
    print(f"\n{'=' * 70}")
    print(f"SUMMARY: {passed} passed, {failed} failed, {total_time:.1f}s total")
    print(f"{'=' * 70}")
    # --- Validate key artifacts ---
    checks = [
        ("Stage 1 goal.md", "stage-01/goal.md"),
        ("Stage 10 experiment.py", "stage-10/experiment.py"),
        ("Stage 12 runs/", "stage-12/runs"),
        ("Stage 14 experiment_summary.json", "stage-14/experiment_summary.json"),
        ("Stage 17 paper_draft.md", "stage-17/paper_draft.md"),
        ("Stage 22 export files", "stage-22"),
    ]
    print("\nArtifact Checks:")
    for label, path in checks:
        full = run_dir / path
        exists = full.exists()
        if full.is_file():
            size = full.stat().st_size
            print(f" {'✅' if exists else '❌'} {label}: {size} bytes")
        elif full.is_dir():
            count = len(list(full.iterdir())) if exists else 0
            print(f" {'✅' if exists else '❌'} {label}: {count} items")
        else:
            print(f" {'❌'} {label}: NOT FOUND")
    # --- Check experiment_summary.json has real data ---
    summary_path = run_dir / "stage-14" / "experiment_summary.json"
    if summary_path.exists():
        summary = json.loads(summary_path.read_text())
        has_metrics = bool(summary.get("metrics_summary"))
        print(
            f"\n 📊 Experiment summary has real metrics: {'YES' if has_metrics else 'NO'}"
        )
        if has_metrics:
            for k, v in summary["metrics_summary"].items():
                print(f" - {k}: {v}")
    # --- Check paper draft has real data (not placeholder) ---
    draft_path = run_dir / "stage-17" / "paper_draft.md"
    if draft_path.exists():
        draft = draft_path.read_text()
        has_placeholder = "no quantitative results yet" in draft.lower()
        has_template = draft.count("Template") > 3
        print(
            f" 📝 Paper draft: {len(draft)} chars, placeholder={has_placeholder}, template={has_template}"
        )
    # --- Check validation report ---
    val_report = run_dir / "stage-10" / "validation_report.md"
    if val_report.exists():
        print(f" 🔍 Code validation report: {val_report.stat().st_size} bytes")
        print(f" {val_report.read_text()[:200]}")
    # Final verdict: success only when every one of the 22 stages completed.
    if passed == 22 and failed == 0:
        print(f"\n🎉 ALL 22 STAGES PASSED!")
        sys.exit(0)
    else:
        print(f"\n⚠️ {failed} stages did not pass.")
        sys.exit(1)


if __name__ == "__main__":
    main()
================================================
FILE: tests/test_anthropic.py
================================================
"""测试 Anthropic Messages 兼容 API 是否可用。"""
from __future__ import annotations
import os
from typing import Any
import httpx
import pytest
pytestmark = pytest.mark.skipif(
"ANTHROPIC_API_KEY" not in os.environ,
reason="ANTHROPIC_API_KEY not set",
)
BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com")
API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-haiku-4-5-20251001")
def _create_message() -> dict[str, Any]:
    """POST a minimal message to the Messages endpoint and return the JSON body.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    endpoint = f"{BASE_URL.rstrip('/')}/v1/messages"
    request_headers = {
        "content-type": "application/json",
        "anthropic-version": "2023-06-01",
        "x-api-key": API_KEY,
    }
    body = {
        "model": MODEL,
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    }
    with httpx.Client(timeout=30.0) as http:
        resp = http.post(endpoint, headers=request_headers, json=body)
        resp.raise_for_status()
        return resp.json()
def test_anthropic_api() -> None:
    """Smoke test: send one message and sanity-check the response envelope."""
    message = _create_message()
    usage = message.get("usage", {})
    content = message.get("content", [])
    # Collect only the text blocks from the content array (other types ignored).
    text_blocks = [block.get("text", "") for block in content if block.get("type") == "text"]
    print(f"Status: stop_reason={message.get('stop_reason')}")
    print(f"Model: {message.get('model')}")
    print(f"Usage: input={usage.get('input_tokens')}, output={usage.get('output_tokens')}")
    print(f"Response: {' '.join(text_blocks)}")
    assert message.get("type") == "message"
    assert len(content) > 0
    print("\n✅ API 可用!")


if __name__ == "__main__":
    test_anthropic_api()
================================================
FILE: tests/test_assessor.py
================================================
"""Tests for researchclaw.assessor — Paper Quality Assessor (Agent D3).
20+ tests covering rubrics, scorer, venue_recommender, and comparator.
"""
from __future__ import annotations
import asyncio
import json
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock
import pytest
from researchclaw.assessor.rubrics import RUBRICS, Rubric
from researchclaw.assessor.scorer import PaperScorer
from researchclaw.assessor.venue_recommender import VenueRecommender
from researchclaw.assessor.comparator import HistoryComparator
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _sample_paper() -> str:
return (
"# Novel Graph Attention Networks\n\n"
"## Abstract\nWe propose a new method for graph-based learning.\n\n"
"## Experiments\nWe compare against baseline on CIFAR-10.\n"
"Results are shown in table 1 and figure 2.\n"
"Our method achieves 95.2% accuracy.\n"
) * 5 # ~500 words
def _sample_scores(overall: float = 7.5) -> dict[str, Any]:
return {
"scores": {
"novelty": 7.0,
"rigor": 8.0,
"clarity": 7.0,
"impact": 7.5,
"experiments": 8.0,
},
"overall": overall,
}
class MockLLM:
    """Minimal mock LLM client that returns a canned response."""

    def __init__(self, response: str = "SCORE: 7\nREASON: Solid contribution"):
        # Canned text handed back by every chat_async call.
        self.response = response

    async def chat_async(self, prompt: str) -> str:
        """Ignore the prompt and return the configured canned response."""
        return self.response
class FailingLLM:
    """Mock LLM whose chat call always raises, to exercise fallback paths."""

    async def chat_async(self, prompt: str) -> str:
        # Simulate a provider outage regardless of the prompt.
        raise RuntimeError("API error")
# ===================================================================
# Rubric tests
# ===================================================================
class TestRubrics:
    """Unit tests for the RUBRICS registry and the Rubric dataclass."""

    def test_all_five_dimensions_present(self):
        # The assessor scores exactly these five dimensions.
        assert set(RUBRICS.keys()) == {
            "novelty", "rigor", "clarity", "impact", "experiments"
        }

    def test_rubric_is_frozen(self):
        # Attribute assignment must fail — Rubric is expected to be immutable.
        r = RUBRICS["novelty"]
        with pytest.raises(AttributeError):
            r.name = "changed"  # type: ignore[misc]

    def test_rubric_has_criteria_and_scale(self):
        # Every registered rubric must carry non-empty criteria and scale text.
        for dim, rubric in RUBRICS.items():
            assert rubric.criteria, f"{dim} missing criteria"
            assert rubric.scale, f"{dim} missing scale"

    def test_default_weight(self):
        r = Rubric(name="test", criteria="test criteria", scale="1-10")
        assert r.weight == 1.0
# ===================================================================
# PaperScorer tests
# ===================================================================
class TestPaperScorer:
def test_score_without_llm(self):
scorer = PaperScorer()
result = asyncio.run(scorer.score(_sample_paper()))
assert "overall" in result
assert "scores" in result
assert isinstance(result["overall"], float)
assert len(result["dimensions_evaluated"]) == 5
def test_score_with_mock_llm(self):
llm = MockLLM("SCORE: 8\nREASON: Excellent work")
scorer = PaperScorer(llm_client=llm)
result = asyncio.run(scorer.score(_sample_paper()))
assert result["overall"] == 8.0
for dim in result["scores"]:
assert result["scores"][dim] == 8.0
def test_score_with_failing_llm_falls_back(self):
scorer = PaperScorer(llm_client=FailingLLM())
result = asyncio.run(scorer.score(_sample_paper()))
# Should still return valid scores via heuristic
assert "overall" in result
assert result["overall"] > 0
def test_score_subset_dimensions(self):
scorer = PaperScorer(dimensions=("novelty", "clarity"))
result = asyncio.run(scorer.score(_sample_paper()))
assert len(result["dimensions_evaluated"]) == 2
def test_parse_score_valid(self):
score, reason = PaperScorer._parse_score_response(
"SCORE: 9\nREASON: Breakthrough paper", "novelty"
)
assert score == 9.0
assert reason == "Breakthrough paper"
def test_parse_score_clamped(self):
score, _ = PaperScorer._parse_score_response("SCORE: 15", "test")
assert score == 10.0
score, _ = PaperScorer._parse_score_response("SCORE: 0", "test")
assert score == 1.0
def test_parse_score_missing(self):
score, reason = PaperScorer._parse_score_response("No format here", "test")
assert score == 5.0 # default
assert reason == "No detail provided"
def test_heuristic_clarity_long_paper(self):
long_paper = "word " * 4000
score, detail = PaperScorer._heuristic_score(long_paper, RUBRICS["clarity"])
assert score == 6.0
assert "4000" in detail
def test_heuristic_clarity_short_paper(self):
short_paper = "word " * 500
score, _ = PaperScorer._heuristic_score(short_paper, RUBRICS["clarity"])
assert score == 3.0
def test_heuristic_experiments_with_table_and_figure(self):
paper = "Results in table 1 and figure 3 show improvements."
score, _ = PaperScorer._heuristic_score(paper, RUBRICS["experiments"])
assert score == 7.0 # 4.0 + 1.5 + 1.5
def test_heuristic_experiments_no_evidence(self):
    """Without tables or figures only the base experiments score remains."""
    text = "We discuss theoretical implications."
    score, _ = PaperScorer._heuristic_score(text, RUBRICS["experiments"])
    assert score == 4.0
def test_heuristic_default_dimension(self):
    """Dimensions without a dedicated heuristic get the neutral default."""
    score, reason = PaperScorer._heuristic_score("Some paper content", RUBRICS["novelty"])
    assert score == 5.0
    assert "default" in reason.lower()
# ===================================================================
# VenueRecommender tests
# ===================================================================
class TestVenueRecommender:
    """Tests for venue recommendation thresholds, suggestions, formatting."""

    def test_recommend_high_score(self):
        """A 9.0 overall should unlock at least one tier-1 venue."""
        recs = VenueRecommender().recommend(_sample_scores(overall=9.0))
        assert any(r["tier"] == "tier_1" for r in recs)

    def test_recommend_low_score(self):
        """A 2.0 overall is below every venue's threshold."""
        recs = VenueRecommender().recommend(_sample_scores(overall=2.0))
        assert len(recs) == 0

    def test_recommend_medium_score_no_tier1(self):
        """A middling 5.0 overall must not reach any tier-1 venue."""
        recs = VenueRecommender().recommend(_sample_scores(overall=5.0))
        assert not any(r["tier"] == "tier_1" for r in recs)

    def test_recommend_filter_by_domain(self):
        """Domain filtering keeps only venues matching the requested domain."""
        recs = VenueRecommender().recommend(_sample_scores(overall=9.0), domains=["cv"])
        for rec in recs:
            venue_domains = rec["venue_domains"]
            assert "cv" in venue_domains or "deep-learning" in venue_domains

    def test_get_suggestion_weak_dimension(self):
        """A clearly weak dimension should be called out for strengthening."""
        payload = {"scores": {"novelty": 3, "clarity": 8}, "overall": 5.5}
        tip = VenueRecommender._get_suggestion("ICML", payload)
        assert "novelty" in tip.lower()
        assert "Strengthen" in tip

    def test_get_suggestion_moderate(self):
        """A borderline paper receives 'improving' advice."""
        payload = {"scores": {"novelty": 6, "clarity": 8}, "overall": 7.0}
        tip = VenueRecommender._get_suggestion("ICML", payload)
        assert "improving" in tip.lower()

    def test_get_suggestion_strong(self):
        """A clearly strong paper receives an encouraging suggestion."""
        payload = {"scores": {"novelty": 8, "clarity": 9}, "overall": 8.5}
        tip = VenueRecommender._get_suggestion("ICML", payload)
        assert "strong" in tip.lower()

    def test_get_suggestion_no_scores(self):
        """Without per-dimension scores the fallback asks for evaluation."""
        tip = VenueRecommender._get_suggestion("ICML", {"overall": 5.0})
        assert "Evaluate" in tip

    def test_format_recommendations_empty(self):
        """Formatting an empty list yields the 'no venues' message."""
        assert "No suitable venues" in VenueRecommender().format_recommendations([])

    def test_format_recommendations_with_data(self):
        """Formatting real recommendations includes the section header."""
        recommender = VenueRecommender()
        recs = recommender.recommend(_sample_scores(overall=9.0))
        assert "Venue Recommendations" in recommender.format_recommendations(recs)
# ===================================================================
# HistoryComparator tests
# ===================================================================
class TestHistoryComparator:
    """Tests for run-history recording, persistence, and comparison.

    Every comparator is pointed at a per-test ``tmp_path`` so that tests
    neither read nor write the user's real history directory. Two tests
    previously used ``HistoryComparator()`` with its default directory,
    which could pick up real persisted runs (flaky) and leave state behind.
    """

    def test_record_and_get_history(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(7.5))
        history = comp.get_history()
        assert len(history) == 1
        assert history[0]["run_id"] == "run-1"

    def test_record_persists_to_disk(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(7.5))
        # Reload from disk: a fresh instance must see the record.
        comp2 = HistoryComparator(history_dir=tmp_path)
        assert len(comp2.get_history()) == 1

    def test_compare_no_history(self, tmp_path: Path):
        # BUG FIX: previously used HistoryComparator() with the default
        # history dir, so real persisted runs could flip the result away
        # from "no_history". An empty tmp_path is hermetic.
        comp = HistoryComparator(history_dir=tmp_path)
        result = comp.compare(_sample_scores(8.0))
        assert result["comparison"] == "no_history"

    def test_compare_with_previous(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(6.0))
        result = comp.compare(_sample_scores(8.0), previous_run_id="run-1")
        assert result["comparison"] == "success"
        assert result["delta"] == 2.0
        assert result["trend"] == "improved"

    def test_compare_stable_trend(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(7.5))
        result = comp.compare(_sample_scores(7.5))
        assert result["trend"] == "stable"

    def test_compare_declined_trend(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(9.0))
        result = comp.compare(_sample_scores(7.0))
        assert result["trend"] == "declined"

    def test_compare_not_found(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(7.0))
        result = comp.compare(_sample_scores(8.0), previous_run_id="nonexistent")
        assert result["comparison"] == "not_found"

    def test_get_best_run(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        comp.record("run-1", "topic A", _sample_scores(6.0))
        comp.record("run-2", "topic B", _sample_scores(9.0))
        comp.record("run-3", "topic C", _sample_scores(7.5))
        best = comp.get_best_run()
        assert best is not None
        assert best["run_id"] == "run-2"

    def test_get_best_run_empty(self, tmp_path: Path):
        # BUG FIX: an isolated empty dir guarantees there is truly no run,
        # instead of relying on the default history dir being empty.
        assert HistoryComparator(history_dir=tmp_path).get_best_run() is None

    def test_dimension_deltas(self, tmp_path: Path):
        comp = HistoryComparator(history_dir=tmp_path)
        scores_old = {
            "scores": {"novelty": 5.0, "clarity": 6.0},
            "overall": 5.5,
        }
        scores_new = {
            "scores": {"novelty": 7.0, "clarity": 8.0},
            "overall": 7.5,
        }
        comp.record("run-1", "topic A", scores_old)
        result = comp.compare(scores_new, previous_run_id="run-1")
        assert result["dimension_deltas"]["novelty"] == 2.0
        assert result["dimension_deltas"]["clarity"] == 2.0
================================================
FILE: tests/test_benchmark_agent.py
================================================
"""Tests for the BenchmarkAgent multi-agent system."""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import pytest
import yaml
# ---------------------------------------------------------------------------
# Fake LLM client (same pattern as test_code_agent.py)
# ---------------------------------------------------------------------------
@dataclass
class FakeLLMResponse:
content: str = ""
model: str = "fake"
prompt_tokens: int = 10
completion_tokens: int = 20
total_tokens: int = 30
finish_reason: str = "stop"
truncated: bool = False
raw: dict = field(default_factory=dict)
class FakeLLM:
"""Fake LLM that returns preconfigured responses."""
def __init__(self, responses: list[str] | None = None) -> None:
self._responses = list(responses or [])
self._idx = 0
self.calls: list[dict[str, Any]] = []
def chat(self, messages, **kwargs) -> FakeLLMResponse:
self.calls.append({"messages": messages, **kwargs})
if self._idx < len(self._responses):
content = self._responses[self._idx]
self._idx += 1
else:
content = '{"benchmarks": [], "baselines": []}'
return FakeLLMResponse(content=content)
# ---------------------------------------------------------------------------
# Knowledge base tests
# ---------------------------------------------------------------------------
class TestBenchmarkKnowledge:
    """Sanity checks for the bundled benchmark_knowledge.yaml file."""

    @staticmethod
    def _load() -> dict:
        """Parse the knowledge YAML shipped next to the surveyor module."""
        from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH
        return yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8"))

    def test_knowledge_file_exists(self) -> None:
        from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH
        assert _KNOWLEDGE_PATH.exists(), f"Knowledge file missing: {_KNOWLEDGE_PATH}"

    def test_knowledge_loads(self) -> None:
        data = self._load()
        assert isinstance(data, dict)
        assert "domains" in data

    def test_knowledge_has_domains(self) -> None:
        domains = self._load()["domains"]
        assert len(domains) >= 10, f"Expected 10+ domains, got {len(domains)}"

    def test_each_domain_has_benchmarks_and_baselines(self) -> None:
        for did, info in self._load()["domains"].items():
            assert "keywords" in info, f"Domain {did} missing keywords"
            assert "standard_benchmarks" in info, f"Domain {did} missing benchmarks"
            assert "common_baselines" in info, f"Domain {did} missing baselines"
            assert len(info["standard_benchmarks"]) > 0, f"Domain {did} has 0 benchmarks"
            assert len(info["common_baselines"]) > 0, f"Domain {did} has 0 baselines"

    def test_benchmark_entries_have_required_fields(self) -> None:
        for did, info in self._load()["domains"].items():
            for b in info["standard_benchmarks"]:
                assert "name" in b, f"Benchmark in {did} missing name"
                assert "tier" in b, f"Benchmark {b.get('name')} in {did} missing tier"
                assert b["tier"] in (1, 2, 3), f"Invalid tier for {b.get('name')}"

    def test_baseline_entries_have_required_fields(self) -> None:
        for did, info in self._load()["domains"].items():
            for bl in info["common_baselines"]:
                assert "name" in bl, f"Baseline in {did} missing name"
                assert "source" in bl, f"Baseline {bl.get('name')} in {did} missing source"
                assert "paper" in bl, f"Baseline {bl.get('name')} in {did} missing paper"
# ---------------------------------------------------------------------------
# Surveyor tests
# ---------------------------------------------------------------------------
class TestSurveyor:
    """Tests for SurveyorAgent domain matching and local knowledge search."""

    @staticmethod
    def _agent(llm=None):
        """Build a SurveyorAgent with HF search disabled (hermetic tests)."""
        from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent
        return SurveyorAgent(llm if llm is not None else FakeLLM(),
                             enable_hf_search=False)

    def test_domain_matching_image_classification(self) -> None:
        matched = self._agent()._match_domains(
            "Image Classification with Contrastive Learning"
        )
        assert "image_classification" in matched

    def test_domain_matching_rl(self) -> None:
        matched = self._agent()._match_domains(
            "Reinforcement Learning for Continuous Control"
        )
        assert "reinforcement_learning" in matched

    def test_domain_matching_knowledge_distillation(self) -> None:
        matched = self._agent()._match_domains(
            "Knowledge Distillation with Feature Alignment"
        )
        assert "knowledge_distillation" in matched

    def test_domain_matching_multiple(self) -> None:
        # A topic spanning two areas should match at least two domains.
        matched = self._agent()._match_domains(
            "Self-Supervised Contrastive Learning for Image Classification"
        )
        assert len(matched) >= 2

    def test_local_candidates_returns_benchmarks(self) -> None:
        candidates = self._agent()._get_local_candidates(["image_classification"])
        assert len(candidates["benchmarks"]) > 0
        assert len(candidates["baselines"]) > 0

    def test_execute_returns_benchmarks(self) -> None:
        outcome = self._agent().execute({
            "topic": "Image Classification with Data Augmentation",
            "hypothesis": "Novel augmentation improves accuracy",
        })
        assert outcome.success
        assert len(outcome.data["benchmarks"]) > 0

    def test_execute_with_unknown_topic_uses_llm_fallback(self) -> None:
        # A topic with no knowledge-base match forces the LLM fallback path.
        scripted = FakeLLM([json.dumps({
            "benchmarks": [{"name": "CustomDS", "tier": 2}],
            "baselines": [{"name": "CustomBL", "source": "custom", "paper": "X"}],
            "rationale": "test",
        })])
        outcome = self._agent(scripted).execute({
            "topic": "Completely Novel Alien Technology Classification",
            "hypothesis": "",
        })
        assert outcome.success
        assert outcome.data["llm_fallback_used"]

    def test_extract_search_keywords(self) -> None:
        from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent
        keywords = SurveyorAgent._extract_search_keywords(
            "Novel Approach for Image Classification using Contrastive Learning"
        )
        assert len(keywords) >= 1
        # Stop-words like "novel"/"using" must not survive extraction.
        for kw in keywords:
            assert "novel" not in kw.lower()
            assert "using" not in kw.lower()

    def test_execute_empty_topic_fails(self) -> None:
        outcome = self._agent().execute({"topic": ""})
        assert not outcome.success
# ---------------------------------------------------------------------------
# Selector tests
# ---------------------------------------------------------------------------
class TestSelector:
    """Tests for SelectorAgent filtering and ranking logic."""

    @pytest.fixture()
    def benchmarks(self) -> list[dict]:
        """Candidate benchmarks spanning tiers 1-3 and both origins."""
        return [
            {"name": "CIFAR-10", "tier": 1, "size_mb": 170, "origin": "knowledge_base",
             "metrics": ["accuracy"]},
            {"name": "CIFAR-100", "tier": 1, "size_mb": 170, "origin": "knowledge_base",
             "metrics": ["accuracy"]},
            {"name": "Tiny-ImageNet", "tier": 2, "size_mb": 237, "origin": "knowledge_base",
             "metrics": ["top1_accuracy"]},
            {"name": "ImageNet-1K", "tier": 3, "size_mb": 168000, "origin": "knowledge_base",
             "metrics": ["top1_accuracy"]},
            {"name": "hf/custom-ds", "tier": 2, "size_mb": 500, "origin": "huggingface_hub",
             "downloads": 1000},
        ]

    @pytest.fixture()
    def baselines(self) -> list[dict]:
        """Two candidate baselines, one with an extra pip dependency."""
        return [
            {"name": "ResNet-18", "origin": "knowledge_base", "pip": [],
             "paper": "He et al."},
            {"name": "ViT-B/16", "origin": "knowledge_base", "pip": ["timm"],
             "paper": "Dosovitskiy et al."},
        ]

    def test_filter_excludes_tier3(self, benchmarks: list[dict]) -> None:
        from researchclaw.agents.benchmark_agent.selector import SelectorAgent
        kept = SelectorAgent(FakeLLM(), tier_limit=2)._filter_benchmarks(benchmarks)
        kept_names = {b["name"] for b in kept}
        assert "ImageNet-1K" not in kept_names
        assert "CIFAR-10" in kept_names

    def test_filter_network_none_only_tier1(self, benchmarks: list[dict]) -> None:
        from researchclaw.agents.benchmark_agent.selector import SelectorAgent
        kept = SelectorAgent(FakeLLM(), network_policy="none")._filter_benchmarks(benchmarks)
        # With networking disabled, only tier-1 benchmarks may survive.
        assert all(b["tier"] == 1 for b in kept)

    def test_ranking_prefers_tier1(self, benchmarks: list[dict]) -> None:
        from researchclaw.agents.benchmark_agent.selector import SelectorAgent
        agent = SelectorAgent(FakeLLM())
        ranked = agent._rank_benchmarks(agent._filter_benchmarks(benchmarks))
        assert ranked[0]["tier"] == 1  # best-ranked entry must be tier 1

    def test_ranking_prefers_knowledge_base(self, benchmarks: list[dict]) -> None:
        from researchclaw.agents.benchmark_agent.selector import SelectorAgent
        agent = SelectorAgent(FakeLLM())
        ranked = agent._rank_benchmarks(agent._filter_benchmarks(benchmarks))
        # Knowledge-base entries should precede HF entries of the same tier.
        kb_positions = [i for i, b in enumerate(ranked) if b["origin"] == "knowledge_base"]
        hf_positions = [i for i, b in enumerate(ranked) if b["origin"] == "huggingface_hub"]
        if kb_positions and hf_positions:
            assert min(kb_positions) < min(hf_positions)

    def test_execute_selects_minimum(self, benchmarks: list[dict],
                                     baselines: list[dict]) -> None:
        from researchclaw.agents.benchmark_agent.selector import SelectorAgent
        scripted = FakeLLM([json.dumps({
            "primary_benchmark": "CIFAR-10",
            "secondary_benchmarks": ["CIFAR-100"],
            "selected_baselines": ["ResNet-18", "ViT-B/16"],
            "rationale": "Standard benchmarks",
            "experiment_notes": "",
        })])
        outcome = SelectorAgent(scripted, min_benchmarks=1, min_baselines=2).execute({
            "topic": "Image Classification",
            "survey": {"benchmarks": benchmarks, "baselines": baselines},
        })
        assert outcome.success
        assert len(outcome.data["selected_benchmarks"]) >= 1
        assert len(outcome.data["selected_baselines"]) >= 2
# ---------------------------------------------------------------------------
# Acquirer tests
# ---------------------------------------------------------------------------
class TestAcquirer:
    """Tests for AcquirerAgent code generation."""

    @staticmethod
    def _agent(llm=None):
        """Build an AcquirerAgent, defaulting to an unscripted FakeLLM."""
        from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent
        return AcquirerAgent(llm if llm is not None else FakeLLM())

    def test_generate_setup_script_tier1_only(self) -> None:
        script = self._agent()._generate_setup_script(
            [{"name": "CIFAR-10", "tier": 1, "api": "torchvision..."}], []
        )
        # Tier 1 datasets don't need setup scripts
        assert script == ""

    def test_generate_setup_script_tier2(self) -> None:
        script = self._agent()._generate_setup_script(
            [{"name": "IMDB", "tier": 2,
              "api": "datasets.load_dataset('imdb', cache_dir='/workspace/data/hf')"}],
            [],
        )
        assert "download_datasets" in script
        assert "load_dataset" in script

    def test_generate_requirements_filters_builtin(self) -> None:
        reqs = self._agent()._generate_requirements(["torch", "numpy", "xgboost", "timm"])
        # torch/numpy/timm are treated as already available; xgboost is not.
        for preinstalled in ("torch", "numpy", "timm"):
            assert preinstalled not in reqs
        assert "xgboost" in reqs

    def test_strip_fences(self) -> None:
        from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent
        fenced = "```python\nimport torch\n```"
        assert AcquirerAgent._strip_fences(fenced) == "import torch"

    def test_execute_generates_code(self) -> None:
        # Two scripted replies: data-loader code, then baseline code.
        scripted = FakeLLM([
            "import torchvision\ndef get_datasets(): pass",
            "import torch.nn as nn\ndef get_baselines(): pass",
        ])
        outcome = self._agent(scripted).execute({
            "topic": "Image Classification",
            "selection": {
                "selected_benchmarks": [
                    {"name": "CIFAR-10", "tier": 1, "role": "primary",
                     "api": "torchvision.datasets.CIFAR10(...)"},
                ],
                "selected_baselines": [
                    {"name": "ResNet-18", "source": "torchvision.models.resnet18()",
                     "paper": "He et al.", "pip": []},
                ],
                "required_pip": [],
            },
        })
        assert outcome.success
        assert outcome.data["data_loader_code"]
# ---------------------------------------------------------------------------
# Validator tests
# ---------------------------------------------------------------------------
class TestValidator:
    """Tests for ValidatorAgent code validation."""

    @staticmethod
    def _agent(llm=None):
        """Build a ValidatorAgent, defaulting to an unscripted FakeLLM."""
        from researchclaw.agents.benchmark_agent.validator import ValidatorAgent
        return ValidatorAgent(llm if llm is not None else FakeLLM())

    def test_syntax_check_valid(self) -> None:
        assert self._agent()._check_syntax("import torch\nx = 1 + 2", "test") == []

    def test_syntax_check_invalid(self) -> None:
        problems = self._agent()._check_syntax("def foo(\n x = ", "test")
        assert len(problems) > 0
        assert "SyntaxError" in problems[0]

    def test_import_check_builtin_ok(self) -> None:
        # Well-known packages should not be flagged.
        assert self._agent()._check_imports("import torch\nimport numpy", "test", []) == []

    def test_import_check_unknown(self) -> None:
        flagged = self._agent()._check_imports("import some_obscure_lib", "test", [])
        assert len(flagged) > 0

    def test_import_check_with_requirements(self) -> None:
        # A package listed in requirements must not be flagged.
        assert self._agent()._check_imports("import xgboost", "test", ["xgboost"]) == []

    def test_execute_passes_valid_code(self) -> None:
        scripted = FakeLLM([json.dumps({
            "passed": True,
            "issues": [],
            "suggestions": [],
            "severity": "none",
        })])
        outcome = self._agent(scripted).execute({
            "acquisition": {
                "data_loader_code": "import torch\ndef get_datasets(): pass",
                "baseline_code": "import torch.nn as nn\ndef get_baselines(): pass",
                "setup_code": "",
                "requirements": "",
                "benchmark_names": ["CIFAR-10"],
                "baseline_names": ["ResNet-18"],
            },
        })
        assert outcome.success
        assert outcome.data["passed"]

    def test_execute_fails_syntax_error(self) -> None:
        # Broken data-loader code must fail validation with explicit errors.
        outcome = self._agent().execute({
            "acquisition": {
                "data_loader_code": "def foo(\n x = ",
                "baseline_code": "",
                "setup_code": "",
                "requirements": "",
                "benchmark_names": [],
                "baseline_names": [],
            },
        })
        assert not outcome.data["passed"]
        assert len(outcome.data["errors"]) > 0
# ---------------------------------------------------------------------------
# Orchestrator tests
# ---------------------------------------------------------------------------
class TestOrchestrator:
    """Test BenchmarkOrchestrator end-to-end."""

    def test_orchestrate_produces_plan(self, tmp_path: Path) -> None:
        # Happy path: survey -> select -> acquire -> validate, fully scripted.
        # NOTE(review): the response list order must match the orchestrator's
        # internal LLM call sequence exactly — keep it in sync with the pipeline.
        from researchclaw.agents.benchmark_agent.orchestrator import (
            BenchmarkAgentConfig,
            BenchmarkOrchestrator,
        )
        responses = [
            # Selector LLM response
            json.dumps({
                "primary_benchmark": "CIFAR-10",
                "secondary_benchmarks": ["CIFAR-100"],
                "selected_baselines": ["ResNet-18", "ViT-B/16"],
                "rationale": "Standard CV benchmarks",
                "experiment_notes": "Use standard augmentation",
            }),
            # Acquirer: data_loader_code
            "import torchvision\ndef get_datasets(data_root='/workspace/data'):\n return {}",
            # Acquirer: baseline_code
            "import torch.nn as nn\ndef get_baselines(num_classes=10):\n return {}",
            # Validator: LLM review
            json.dumps({
                "passed": True,
                "issues": [],
                "suggestions": ["Add transforms"],
                "severity": "none",
            }),
        ]
        # HF search disabled so the survey stage stays offline/deterministic.
        cfg = BenchmarkAgentConfig(enable_hf_search=False)
        orchestrator = BenchmarkOrchestrator(
            FakeLLM(responses),
            config=cfg,
            stage_dir=tmp_path / "benchmark_agent",
        )
        plan = orchestrator.orchestrate({
            "topic": "Image Classification with Data Augmentation",
            "hypothesis": "Novel augmentation improves accuracy",
        })
        # The plan must carry selections, pass validation, and record metrics.
        assert len(plan.selected_benchmarks) >= 1
        assert len(plan.selected_baselines) >= 1
        assert plan.validation_passed
        assert plan.total_llm_calls > 0
        assert plan.elapsed_sec > 0

    def test_orchestrate_saves_artifacts(self, tmp_path: Path) -> None:
        # Each pipeline stage should persist its JSON artifact under stage_dir.
        from researchclaw.agents.benchmark_agent.orchestrator import (
            BenchmarkAgentConfig,
            BenchmarkOrchestrator,
        )
        responses = [
            json.dumps({
                "primary_benchmark": "CIFAR-10",
                "secondary_benchmarks": [],
                "selected_baselines": ["ResNet-18"],
                "rationale": "test",
                "experiment_notes": "",
            }),
            "def get_datasets(): pass",
            "def get_baselines(): pass",
            json.dumps({"passed": True, "issues": [], "suggestions": [], "severity": "none"}),
        ]
        stage_dir = tmp_path / "benchmark_agent"
        cfg = BenchmarkAgentConfig(enable_hf_search=False)
        orchestrator = BenchmarkOrchestrator(
            FakeLLM(responses),
            config=cfg,
            stage_dir=stage_dir,
        )
        orchestrator.orchestrate({
            "topic": "Image Classification",
            "hypothesis": "",
        })
        assert (stage_dir / "survey_results.json").exists()
        assert (stage_dir / "selection_results.json").exists()
        assert (stage_dir / "benchmark_plan.json").exists()

    def test_plan_to_prompt_block(self) -> None:
        # to_prompt_block() should surface selected names and generated code.
        from researchclaw.agents.benchmark_agent.orchestrator import BenchmarkPlan
        plan = BenchmarkPlan(
            selected_benchmarks=[
                {"name": "CIFAR-10", "role": "primary", "metrics": ["accuracy"],
                 "api": "torchvision.datasets.CIFAR10(...)"},
            ],
            selected_baselines=[
                {"name": "ResNet-18", "source": "torchvision.models.resnet18()",
                 "paper": "He et al."},
            ],
            data_loader_code="def get_datasets(): pass",
            baseline_code="def get_baselines(): pass",
        )
        block = plan.to_prompt_block()
        assert "CIFAR-10" in block
        assert "ResNet-18" in block
        assert "get_datasets" in block
        assert "get_baselines" in block

    def test_plan_to_dict_serializable(self) -> None:
        # to_dict() output must round-trip through json.dumps.
        from researchclaw.agents.benchmark_agent.orchestrator import BenchmarkPlan
        plan = BenchmarkPlan(
            selected_benchmarks=[{"name": "test"}],
            data_loader_code="code",
        )
        d = plan.to_dict()
        # Should be JSON-serializable
        json_str = json.dumps(d)
        assert "test" in json_str
# ---------------------------------------------------------------------------
# Config tests
# ---------------------------------------------------------------------------
class TestConfig:
    """Tests for the benchmark-agent section of researchclaw.config."""

    def test_default_config_has_benchmark_agent(self) -> None:
        from researchclaw.config import ExperimentConfig
        experiment = ExperimentConfig()
        assert hasattr(experiment, "benchmark_agent")
        assert experiment.benchmark_agent.enabled is True

    def test_parse_benchmark_agent_config(self) -> None:
        from researchclaw.config import _parse_benchmark_agent_config
        parsed = _parse_benchmark_agent_config({
            "enabled": False,
            "tier_limit": 1,
            "min_baselines": 3,
        })
        assert parsed.enabled is False
        assert parsed.tier_limit == 1
        assert parsed.min_baselines == 3

    def test_parse_benchmark_agent_config_empty(self) -> None:
        from researchclaw.config import _parse_benchmark_agent_config
        # An empty mapping must yield all defaults.
        defaults = _parse_benchmark_agent_config({})
        assert defaults.enabled is True
        assert defaults.tier_limit == 2
# ---------------------------------------------------------------------------
# Base agent tests
# ---------------------------------------------------------------------------
class TestBaseAgent:
    """Tests for BaseAgent._parse_json tolerant JSON extraction."""

    @staticmethod
    def _parse(text: str):
        """Shortcut for the static JSON extraction helper under test."""
        from researchclaw.agents.base import BaseAgent
        return BaseAgent._parse_json(text)

    def test_parse_json_direct(self) -> None:
        assert self._parse('{"key": "value"}') == {"key": "value"}

    def test_parse_json_fenced(self) -> None:
        # JSON inside a ```json fence with surrounding prose.
        assert self._parse('Some text\n```json\n{"key": 1}\n```\nMore text') == {"key": 1}

    def test_parse_json_embedded(self) -> None:
        # Bare JSON embedded mid-sentence.
        assert self._parse('Here is the result: {"a": 2} end') == {"a": 2}

    def test_parse_json_invalid(self) -> None:
        assert self._parse("no json here at all") is None
# ---------------------------------------------------------------------------
# Required baselines injection (Improvement E)
# ---------------------------------------------------------------------------
class TestRequiredBaselines:
    """Test that required baselines are injected from the knowledge base."""

    @staticmethod
    def _agent():
        from researchclaw.agents.benchmark_agent.selector import SelectorAgent
        return SelectorAgent(FakeLLM(), min_baselines=1)

    def test_inject_required_baselines_image_classification(self) -> None:
        chosen: list[dict[str, Any]] = [
            {"name": "EfficientNet-B0", "origin": "knowledge_base"},
        ]
        added = self._agent()._inject_required_baselines(
            "image classification on CIFAR-10",
            chosen,
            [],
        )
        # ResNet-50 and ViT-B/16 are required for image_classification.
        added_names = {b["name"] for b in added}
        assert "ResNet-50" in added_names
        assert "ViT-B/16" in added_names
        # The pre-existing baseline must not be duplicated.
        assert sum(1 for b in chosen if b["name"] == "EfficientNet-B0") == 1

    def test_inject_required_baselines_no_duplicates(self) -> None:
        chosen: list[dict[str, Any]] = [
            {"name": "ResNet-50", "origin": "knowledge_base"},
            {"name": "ViT-B/16", "origin": "llm_suggestion"},
        ]
        added = self._agent()._inject_required_baselines(
            "image classification on CIFAR-10",
            chosen,
            [],
        )
        # Both required baselines are already present -> nothing is injected.
        assert len(added) == 0
================================================
FILE: tests/test_calendar.py
================================================
"""Tests for researchclaw.calendar — Conference Deadline Calendar (Agent D4).
15+ tests covering deadlines, planner, and reminder modules.
"""
from __future__ import annotations
from datetime import date, timedelta
from pathlib import Path
import pytest
import yaml
from researchclaw.calendar.deadlines import Conference, ConferenceCalendar
from researchclaw.calendar.planner import SubmissionPlanner
from researchclaw.calendar.reminder import Reminder, ReminderCalculator
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_conference(
    name: str = "TestConf",
    full_name: str = "Test Conference",
    domains: tuple[str, ...] = ("ml",),
    tier: int = 1,
    abstract_deadline: date | None = None,
    paper_deadline: date | None = None,
    **kwargs,
) -> Conference:
    """Build a Conference with sensible test defaults.

    Extra keyword arguments (e.g. ``url``) are forwarded unchanged to the
    Conference constructor.
    """
    return Conference(
        name=name,
        full_name=full_name,
        domains=domains,
        tier=tier,
        abstract_deadline=abstract_deadline,
        paper_deadline=paper_deadline,
        **kwargs,
    )
def _future(days: int) -> date:
return date.today() + timedelta(days=days)
def _past(days: int) -> date:
return date.today() - timedelta(days=days)
# ===================================================================
# Conference dataclass tests
# ===================================================================
class TestConference:
    """Tests for the Conference dataclass and its deadline properties."""

    def test_from_dict_minimal(self):
        conf = Conference.from_dict({"name": "NeurIPS"})
        assert conf.name == "NeurIPS"
        assert conf.tier == 3  # default
        assert conf.domains == ()

    def test_from_dict_full(self):
        conf = Conference.from_dict({
            "name": "ICML",
            "full_name": "International Conference on Machine Learning",
            "domains": ["ml", "ai"],
            "tier": 1,
            "url": "https://icml.cc",
            "abstract_deadline": "2026-06-01",
            "paper_deadline": "2026-06-08",
        })
        assert conf.name == "ICML"
        assert conf.full_name == "International Conference on Machine Learning"
        assert conf.domains == ("ml", "ai")
        assert conf.tier == 1
        # ISO date strings are parsed into date objects.
        assert conf.abstract_deadline == date(2026, 6, 1)
        assert conf.paper_deadline == date(2026, 6, 8)

    def test_from_dict_date_passthrough(self):
        """date objects in YAML are already date instances."""
        conf = Conference.from_dict({
            "name": "X",
            "abstract_deadline": date(2026, 12, 1),
        })
        assert conf.abstract_deadline == date(2026, 12, 1)

    def test_next_deadline_returns_earliest_future(self):
        conf = _make_conference(
            abstract_deadline=_future(10),
            paper_deadline=_future(20),
        )
        assert conf.next_deadline == _future(10)

    def test_next_deadline_skips_past(self):
        conf = _make_conference(
            abstract_deadline=_past(5),
            paper_deadline=_future(15),
        )
        assert conf.next_deadline == _future(15)

    def test_next_deadline_none_when_all_past(self):
        conf = _make_conference(
            abstract_deadline=_past(10),
            paper_deadline=_past(5),
        )
        assert conf.next_deadline is None

    def test_days_until_deadline(self):
        assert _make_conference(paper_deadline=_future(30)).days_until_deadline == 30

    def test_days_until_deadline_none(self):
        # No deadlines configured at all.
        assert _make_conference().days_until_deadline is None
# ===================================================================
# ConferenceCalendar tests
# ===================================================================
class TestConferenceCalendar:
    """Tests for ConferenceCalendar loading, filtering, and formatting."""

    def test_load_from_yaml(self, tmp_path: Path):
        # Round-trip: dump two conferences to YAML, load them back.
        data = {
            "conferences": [
                {
                    "name": "TestConf",
                    "domains": ["ml"],
                    "tier": 1,
                    "paper_deadline": (_future(30)).isoformat(),
                },
                {
                    "name": "TestConf2",
                    "domains": ["cv"],
                    "tier": 2,
                    "paper_deadline": (_future(60)).isoformat(),
                },
            ]
        }
        yaml_path = tmp_path / "conferences.yaml"
        yaml_path.write_text(yaml.dump(data), encoding="utf-8")
        cal = ConferenceCalendar.load(yaml_path)
        assert len(cal.conferences) == 2
        assert cal.conferences[0].name == "TestConf"

    def test_load_skips_invalid_entries(self, tmp_path: Path):
        # Entries without a "name" field are silently dropped on load.
        data = {
            "conferences": [
                {"name": "Valid", "tier": 1},
                {"invalid": "no name field"},
            ]
        }
        yaml_path = tmp_path / "conf.yaml"
        yaml_path.write_text(yaml.dump(data), encoding="utf-8")
        cal = ConferenceCalendar.load(yaml_path)
        assert len(cal.conferences) == 1

    def test_get_upcoming_filters_by_days(self):
        # A 200-day-out deadline falls outside the 90-day window.
        confs = [
            _make_conference(name="Soon", paper_deadline=_future(10)),
            _make_conference(name="Far", paper_deadline=_future(200)),
        ]
        cal = ConferenceCalendar(confs)
        upcoming = cal.get_upcoming(days=90)
        assert len(upcoming) == 1
        assert upcoming[0].name == "Soon"

    def test_get_upcoming_filters_by_domain(self):
        confs = [
            _make_conference(name="ML", domains=("ml",), paper_deadline=_future(10)),
            _make_conference(name="CV", domains=("cv",), paper_deadline=_future(10)),
        ]
        cal = ConferenceCalendar(confs)
        result = cal.get_upcoming(domains=["ml"], days=90)
        assert len(result) == 1
        assert result[0].name == "ML"

    def test_get_upcoming_filters_by_tier(self):
        confs = [
            _make_conference(name="T1", tier=1, paper_deadline=_future(10)),
            _make_conference(name="T3", tier=3, paper_deadline=_future(10)),
        ]
        cal = ConferenceCalendar(confs)
        result = cal.get_upcoming(tier=1, days=90)
        assert len(result) == 1
        assert result[0].name == "T1"

    def test_get_by_name_case_insensitive(self):
        confs = [_make_conference(name="NeurIPS")]
        cal = ConferenceCalendar(confs)
        assert cal.get_by_name("neurips") is not None
        assert cal.get_by_name("NEURIPS") is not None
        assert cal.get_by_name("nonexistent") is None

    def test_get_by_domain(self):
        confs = [
            _make_conference(name="A", domains=("ml", "ai")),
            _make_conference(name="B", domains=("cv",)),
        ]
        cal = ConferenceCalendar(confs)
        assert len(cal.get_by_domain("ml")) == 1
        assert len(cal.get_by_domain("cv")) == 1
        assert len(cal.get_by_domain("nlp")) == 0

    def test_format_upcoming_no_deadlines(self):
        cal = ConferenceCalendar([])
        output = cal.format_upcoming()
        assert "No upcoming deadlines" in output

    def test_format_upcoming_with_deadlines(self):
        # The formatted listing shows name, countdown, and URL.
        confs = [_make_conference(
            name="ICML", paper_deadline=_future(15), url="https://icml.cc"
        )]
        cal = ConferenceCalendar(confs)
        output = cal.format_upcoming(days=90)
        assert "ICML" in output
        assert "15 days left" in output
        assert "https://icml.cc" in output

    def test_load_builtin(self):
        """Built-in conferences.yaml should load without error."""
        cal = ConferenceCalendar.load_builtin()
        assert isinstance(cal.conferences, list)
# ===================================================================
# SubmissionPlanner tests
# ===================================================================
class TestSubmissionPlanner:
    """Tests for SubmissionPlanner plan generation and formatting."""

    def test_plan_basic(self):
        """plan() returns venue, total days, and one milestone per stage."""
        conf = _make_conference(name="TestConf", paper_deadline=_future(100))
        cal = ConferenceCalendar([conf])
        planner = SubmissionPlanner(cal)
        plan = planner.plan("TestConf", start_date=date.today())
        assert plan["venue"] == "TestConf"
        assert plan["total_days"] == 100
        assert len(plan["milestones"]) == 8  # 8 stages in STAGE_PROPORTIONS

    def test_plan_unknown_venue(self):
        """Planning an unknown venue yields an error payload, not an exception."""
        cal = ConferenceCalendar([])
        planner = SubmissionPlanner(cal)
        result = planner.plan("NonExistent")
        assert "error" in result

    def test_plan_past_deadline(self):
        """Planning against an already-passed deadline reports 'passed'."""
        conf = _make_conference(name="Past", paper_deadline=_past(5))
        cal = ConferenceCalendar([conf])
        planner = SubmissionPlanner(cal)
        result = planner.plan("Past", start_date=date.today())
        assert "error" in result
        assert "passed" in result["error"]

    def test_format_plan(self):
        """format_plan renders a titled plan with a milestones section."""
        conf = _make_conference(name="ICML", paper_deadline=_future(60))
        cal = ConferenceCalendar([conf])
        planner = SubmissionPlanner(cal)
        output = planner.format_plan("ICML", start_date=date.today())
        assert "Submission Plan for ICML" in output
        assert "Milestones:" in output

    def test_format_plan_error(self):
        """format_plan surfaces plan errors with an 'Error:' prefix."""
        cal = ConferenceCalendar([])
        planner = SubmissionPlanner(cal)
        output = planner.format_plan("None")
        assert "Error:" in output
# ===================================================================
# ReminderCalculator tests
# ===================================================================
class TestReminderCalculator:
    """Tests for ReminderCalculator firing rules, urgency, and formatting."""

    def test_check_fires_on_matching_day(self):
        """A reminder fires when days-until exactly matches a configured day."""
        deadline = date.today() + timedelta(days=7)
        conf = _make_conference(name="Conf", paper_deadline=deadline)
        calc = ReminderCalculator(reminder_days=(7,))
        reminders = calc.check([conf])
        assert len(reminders) == 1
        assert reminders[0].days_until == 7

    def test_check_no_fire_on_non_matching_day(self):
        """No reminder fires when days-until is not in reminder_days."""
        deadline = date.today() + timedelta(days=8)
        conf = _make_conference(name="Conf", paper_deadline=deadline)
        calc = ReminderCalculator(reminder_days=(7,))
        reminders = calc.check([conf])
        assert len(reminders) == 0

    def test_check_skips_past_deadlines(self):
        """Past deadlines never generate reminders."""
        conf = _make_conference(name="Conf", paper_deadline=_past(3))
        calc = ReminderCalculator(reminder_days=(3,))
        assert len(calc.check([conf])) == 0

    def test_urgency_critical(self):
        """1-3 days out classifies as 'critical'."""
        assert ReminderCalculator._classify_urgency(1) == "critical"
        assert ReminderCalculator._classify_urgency(3) == "critical"

    def test_urgency_warning(self):
        """7-14 days out classifies as 'warning'."""
        assert ReminderCalculator._classify_urgency(7) == "warning"
        assert ReminderCalculator._classify_urgency(14) == "warning"

    def test_urgency_info(self):
        """30 days out classifies as 'info'."""
        assert ReminderCalculator._classify_urgency(30) == "info"

    def test_get_active_reminders(self):
        """Only conferences within the largest reminder window are active."""
        confs = [
            _make_conference(name="Soon", paper_deadline=_future(5)),
            _make_conference(name="Far", paper_deadline=_future(100)),
        ]
        calc = ReminderCalculator(reminder_days=(30, 14, 7, 3, 1))
        active = calc.get_active_reminders(confs)
        assert len(active) == 1
        assert active[0].conference_name == "Soon"

    def test_format_reminders_empty(self):
        """Formatting an empty list produces a 'No upcoming' message."""
        calc = ReminderCalculator()
        assert "No upcoming" in calc.format_reminders([])

    def test_format_reminders_with_data(self):
        """Critical reminders are rendered with the name and a '!!!' marker."""
        r = Reminder(
            conference_name="ICML",
            deadline_type="paper",
            deadline_date=_future(3),
            days_until=3,
            urgency="critical",
        )
        calc = ReminderCalculator()
        output = calc.format_reminders([r])
        assert "ICML" in output
        assert "!!!" in output

    def test_reminder_frozen(self):
        """Reminder is a frozen dataclass: attribute writes raise."""
        r = Reminder("X", "paper", date.today(), 5, "info")
        with pytest.raises(AttributeError):
            r.days_until = 10  # type: ignore[misc]
================================================
FILE: tests/test_cli.py
================================================
"""Tests for CLI setup helpers."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
from researchclaw import cli
def test_install_opencode_uses_which_resolved_npm_path():
    """_install_opencode must pass the shutil.which()-resolved npm path as argv[0].

    A Windows-style 'npm.cmd' path is used to verify the resolved path is
    forwarded verbatim to subprocess.run rather than the bare command name.
    """
    mock_result = MagicMock()
    mock_result.returncode = 0
    with patch(
        "researchclaw.cli.shutil.which",
        return_value=r"C:\Program Files\nodejs\npm.cmd",
    ), patch("researchclaw.cli.subprocess.run", return_value=mock_result) as run_mock:
        assert cli._install_opencode() is True
        run_mock.assert_called_once()
        # argv[0] of the subprocess call must be the resolved absolute path.
        assert run_mock.call_args.args[0][0] == r"C:\Program Files\nodejs\npm.cmd"
def test_install_opencode_returns_false_when_npm_missing():
    """When npm cannot be resolved on PATH, installation reports failure."""
    with patch("researchclaw.cli.shutil.which", return_value=None):
        assert cli._install_opencode() is False
def test_is_opencode_installed_uses_which_resolved_path():
    """_is_opencode_installed must invoke the which()-resolved opencode path."""
    mock_result = MagicMock()
    mock_result.returncode = 0
    with patch(
        "researchclaw.cli.shutil.which",
        return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd",
    ), patch("researchclaw.cli.subprocess.run", return_value=mock_result) as run_mock:
        assert cli._is_opencode_installed() is True
        run_mock.assert_called_once()
        # argv[0] must be the resolved .cmd path, not the bare command name.
        assert run_mock.call_args.args[0][0].endswith("opencode.cmd")
================================================
FILE: tests/test_code_agent.py
================================================
"""Tests for the advanced multi-phase code generation agent (F-02)."""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import pytest
from researchclaw.llm.client import LLMResponse
from researchclaw.pipeline.code_agent import (
CodeAgent,
CodeAgentConfig,
CodeAgentResult,
SolutionNode,
_SimpleResult,
)
from researchclaw.prompts import PromptManager
# ---------------------------------------------------------------------------
# Test fixtures
# ---------------------------------------------------------------------------
class FakeLLM:
    """Stub LLM client that replays a scripted sequence of responses.

    Every chat() invocation is recorded in ``calls`` so tests can inspect
    the prompts that were sent.  Once the script is exhausted, the final
    scripted response is repeated; with no script at all, a canned code
    block is returned.
    """

    # Canned reply used when no responses were scripted.
    _DEFAULT_REPLY = '```filename:main.py\nprint("hello")\n```'

    def __init__(self, responses: list[str] | None = None):
        # One {"messages": ..., **kwargs} dict per chat() call, in order.
        self.calls: list[dict[str, Any]] = []
        self._scripted = list(responses or [])
        self._cursor = 0

    def chat(self, messages: list[dict], **kwargs: Any) -> LLMResponse:
        """Record the call and return the next scripted reply as an LLMResponse."""
        self.calls.append({"messages": messages, **kwargs})
        if self._scripted:
            # Clamp so the last scripted reply repeats once exhausted.
            text = self._scripted[min(self._cursor, len(self._scripted) - 1)]
        else:
            text = self._DEFAULT_REPLY
        self._cursor += 1
        return LLMResponse(content=text, model="fake-model")
@dataclass
class FakeSandboxResult:
    """Minimal stand-in for a sandbox execution result."""

    returncode: int = 0  # 0 = the sandboxed run succeeded
    stdout: str = "primary_metric: 0.95"  # default mimics a metric line
    stderr: str = ""
    elapsed_sec: float = 1.0
    metrics: dict[str, object] = field(default_factory=dict)
    timed_out: bool = False
class FakeSandbox:
    """Stub sandbox that replays a scripted sequence of run results.

    Each run_project() call is recorded in ``runs``; once the scripted
    results are exhausted, the last one is repeated for further calls.
    """

    def __init__(self, results: list[FakeSandboxResult] | None = None):
        # Project directories passed to run_project(), in call order.
        self.runs: list[Path] = []
        self._scripted = list(results or [FakeSandboxResult()])
        self._cursor = 0

    def run_project(
        self, project_dir: Path, *, entry_point: str = "main.py",
        timeout_sec: int = 300,
    ) -> FakeSandboxResult:
        """Record the call and return the next scripted result."""
        self.runs.append(project_dir)
        # Clamp the index so the final result repeats after the script ends.
        position = min(self._cursor, len(self._scripted) - 1)
        self._cursor += 1
        return self._scripted[position]
@pytest.fixture()
def stage_dir(tmp_path: Path) -> Path:
    """Provide a fresh per-test stage directory under pytest's tmp_path."""
    d = tmp_path / "stage-10"
    d.mkdir()
    return d
@pytest.fixture()
def pm() -> PromptManager:
    """Provide a default PromptManager instance."""
    return PromptManager()
# ---------------------------------------------------------------------------
# CodeAgentConfig tests
# ---------------------------------------------------------------------------
class TestCodeAgentConfig:
    """Tests for CodeAgentConfig defaults and overrides."""

    def test_default_values(self) -> None:
        """Defaults: agent enabled, planning on, tree search off."""
        cfg = CodeAgentConfig()
        assert cfg.enabled is True
        assert cfg.architecture_planning is True
        assert cfg.exec_fix_max_iterations == 3
        assert cfg.tree_search_enabled is False
        assert cfg.review_max_rounds == 2

    def test_custom_values(self) -> None:
        """Constructor keyword overrides are stored verbatim."""
        cfg = CodeAgentConfig(
            enabled=False,
            exec_fix_max_iterations=5,
            tree_search_enabled=True,
            tree_search_candidates=5,
        )
        assert cfg.enabled is False
        assert cfg.exec_fix_max_iterations == 5
        assert cfg.tree_search_enabled is True
        assert cfg.tree_search_candidates == 5
# ---------------------------------------------------------------------------
# Phase 1: Architecture Planning
# ---------------------------------------------------------------------------
class TestPhase1Architecture:
    """Tests for the optional architecture-planning phase of CodeAgent."""

    def test_architecture_planning_produces_spec(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """With planning enabled, the result carries the architecture spec."""
        arch_yaml = (
            "```yaml\nfiles:\n - name: main.py\n purpose: entry point\n"
            " - name: models.py\n purpose: models\n```"
        )
        code = '```filename:main.py\nprint("metric: 1.0")\n```'
        # reviewer approves immediately
        review = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}'
        llm = FakeLLM(responses=[arch_yaml, code, review])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(architecture_planning=True),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="test topic", exp_plan="objectives: test",
            metric="accuracy", pkg_hint="numpy, torch",
        )
        assert result.architecture_spec
        assert "main.py" in result.architecture_spec
        assert result.files
        assert result.total_llm_calls >= 2  # arch + codegen + review

    def test_architecture_planning_disabled(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """With planning disabled, no arch spec and no planning prompt is sent."""
        code = '```filename:main.py\nprint("metric: 1.0")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(architecture_planning=False),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="test", exp_plan="plan", metric="m", pkg_hint="",
        )
        assert result.architecture_spec == ""
        assert result.files
        # First call should be code_generation, not the architecture planning prompt
        first_call_user = llm.calls[0]["messages"][0]["content"]
        # The architecture planning prompt has "Design the architecture" phrasing
        assert "design the architecture for an experiment" not in first_call_user.lower()
# ---------------------------------------------------------------------------
# Phase 2: Execution-in-the-Loop
# ---------------------------------------------------------------------------
class TestPhase2ExecFix:
    """Tests for the execution-in-the-loop fix phase of CodeAgent."""

    def test_exec_fix_loop_fixes_crashing_code(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """A crashing first run triggers a fix iteration that then succeeds."""
        # Initial code crashes, then fix succeeds
        initial_code = '```filename:main.py\nraise RuntimeError("bug")\n```'
        fixed_code = '```filename:main.py\nprint("metric: 1.0")\n```'
        review = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}'
        llm = FakeLLM(responses=[
            initial_code,  # phase 2: initial generation (no arch)
            fixed_code,    # phase 2: exec-fix iteration
            review,        # phase 4: review
        ])
        sandbox_results = [
            FakeSandboxResult(returncode=1, stderr="RuntimeError: bug"),
            FakeSandboxResult(returncode=0, stdout="metric: 1.0"),
        ]
        fake_sandbox = FakeSandbox(results=sandbox_results)
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                exec_fix_max_iterations=3,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: fake_sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="test", exp_plan="plan", metric="metric", pkg_hint="",
        )
        assert result.files
        assert result.total_sandbox_runs >= 1

    def test_exec_fix_skipped_without_sandbox(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """With no sandbox_factory, the exec-fix phase does zero sandbox runs."""
        code = '```filename:main.py\nprint("m: 1")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(architecture_planning=False),
            stage_dir=stage_dir,
            sandbox_factory=None,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="m", pkg_hint="",
        )
        assert result.total_sandbox_runs == 0
        assert result.files

    def test_exec_fix_max_iterations_respected(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """Persistently crashing code stops after exec_fix_max_iterations runs."""
        code = '```filename:main.py\nraise RuntimeError("persistent")\n```'
        review = '{"verdict": "APPROVE", "score": 5, "critical_issues": []}'
        llm = FakeLLM(responses=[code, code, code, code, review])
        always_crash = FakeSandbox(
            results=[FakeSandboxResult(returncode=1, stderr="RuntimeError")]
        )
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                exec_fix_max_iterations=2,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: always_crash,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="m", pkg_hint="",
        )
        # Should have exactly 2 sandbox runs (max iterations)
        assert result.total_sandbox_runs == 2
# ---------------------------------------------------------------------------
# Phase 3: Solution Tree Search
# ---------------------------------------------------------------------------
class TestPhase3TreeSearch:
    """Tests for the solution tree-search phase of CodeAgent."""

    def test_tree_search_generates_multiple_candidates(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """Tree search explores at least tree_search_candidates nodes."""
        code_a = '```filename:main.py\nprint("metric: 0.5")\n```'
        code_b = '```filename:main.py\nprint("metric: 0.9")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code_a, code_b, review])
        sandbox = FakeSandbox(results=[
            FakeSandboxResult(returncode=0, stdout="metric: 0.5",
                              metrics={"metric": 0.5}),
            FakeSandboxResult(returncode=0, stdout="metric: 0.9",
                              metrics={"metric": 0.9}),
        ])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                tree_search_enabled=True,
                tree_search_candidates=2,
                tree_search_max_depth=1,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="metric", pkg_hint="",
        )
        assert result.tree_nodes_explored >= 2
        assert result.files

    def test_tree_search_fixes_crashing_candidates(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """Crashing candidates are retried with fixes up to max depth."""
        crash_code = '```filename:main.py\nraise ValueError("x")\n```'
        fixed_code = '```filename:main.py\nprint("metric: 1.0")\n```'
        review = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}'
        llm = FakeLLM(responses=[
            crash_code,  # candidate 0
            crash_code,  # candidate 1
            fixed_code,  # fix for candidate 0
            fixed_code,  # fix for candidate 1
            review,      # review
        ])
        results_seq = [
            FakeSandboxResult(returncode=1, stderr="ValueError: x"),
            FakeSandboxResult(returncode=1, stderr="ValueError: x"),
            FakeSandboxResult(returncode=0, stdout="metric: 1.0"),
            FakeSandboxResult(returncode=0, stdout="metric: 1.0"),
        ]
        sandbox = FakeSandbox(results=results_seq)
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                tree_search_enabled=True,
                tree_search_candidates=2,
                tree_search_max_depth=2,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="metric", pkg_hint="",
        )
        assert result.tree_nodes_explored >= 2
# ---------------------------------------------------------------------------
# Phase 4: Multi-Agent Review
# ---------------------------------------------------------------------------
class TestPhase4Review:
    """Tests for the multi-agent review phase of CodeAgent."""

    def test_review_approves_on_first_round(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """An immediate APPROVE verdict ends review after one round."""
        code = '```filename:main.py\nprint("m: 1")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                review_max_rounds=2,
            ),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="m", pkg_hint="",
        )
        assert result.review_rounds == 1

    def test_review_triggers_fix_on_critical_issues(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """A REVISE verdict with critical issues triggers a fix + re-review."""
        code = '```filename:main.py\nprint("m: 1")\n```'
        review1 = json.dumps({
            "verdict": "REVISE",
            "score": 3,
            "critical_issues": ["Missing seed handling", "Wrong metric name"],
            "suggestions": [],
        })
        fixed = '```filename:main.py\nimport random\nrandom.seed(42)\nprint("m: 1")\n```'
        review2 = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review1, fixed, review2])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                review_max_rounds=3,
                hard_validation=False,  # Test focuses on review, not validation
            ),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="m", pkg_hint="",
        )
        assert result.review_rounds == 2
        assert result.total_llm_calls == 4  # codegen + review1 + fix + review2

    def test_review_disabled(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """review_max_rounds=0 skips review entirely (only the codegen call)."""
        code = '```filename:main.py\nprint("m: 1")\n```'
        llm = FakeLLM(responses=[code])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                review_max_rounds=0,
                hard_validation=False,  # Test focuses on review, not validation
            ),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="m", pkg_hint="",
        )
        assert result.review_rounds == 0
        assert result.total_llm_calls == 1  # only codegen
# ---------------------------------------------------------------------------
# Full pipeline tests
# ---------------------------------------------------------------------------
class TestFullPipeline:
    """End-to-end tests covering all CodeAgent phases together."""

    def test_all_phases_end_to_end(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """Arch planning + exec loop + review all run and populate the result."""
        arch = "```yaml\nfiles:\n - name: main.py\n```"
        code = '```filename:main.py\nprint("acc: 0.9")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[arch, code, review])
        sandbox = FakeSandbox(results=[
            FakeSandboxResult(returncode=0, stdout="acc: 0.9"),
        ])
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=True,
                exec_fix_max_iterations=2,
                review_max_rounds=1,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="image classification", exp_plan="test plan",
            metric="accuracy", pkg_hint="torch",
        )
        assert result.architecture_spec
        assert "main.py" in result.files
        assert result.total_llm_calls >= 3  # arch + code + review
        assert result.total_sandbox_runs >= 1
        assert result.review_rounds == 1
        assert result.validation_log

    def test_agent_writes_attempt_directories(
        self, stage_dir: Path, pm: PromptManager,
    ) -> None:
        """Generated files are persisted under agent_runs/attempt_001."""
        code = '```filename:main.py\nprint("x: 1")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review])
        sandbox = FakeSandbox()
        agent = CodeAgent(
            llm=llm, prompts=pm,
            config=CodeAgentConfig(architecture_planning=False),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t", exp_plan="p", metric="x", pkg_hint="",
        )
        attempt_dir = stage_dir / "agent_runs" / "attempt_001"
        assert attempt_dir.exists()
        assert (attempt_dir / "main.py").exists()
# ---------------------------------------------------------------------------
# SolutionNode and scoring
# ---------------------------------------------------------------------------
class TestSolutionNodeScoring:
    """Tests for CodeAgent._score_node heuristics on SolutionNode."""

    def test_score_running_node(self) -> None:
        """A clean run with output and the target metric scores >= 2.0."""
        node = SolutionNode(
            node_id="test",
            files={"main.py": "x"},
            runs_ok=True,
            stdout="lots of output " * 20,
            metrics={"metric": 0.95},
        )
        score = CodeAgent._score_node(node, "metric")
        assert score >= 2.0  # runs_ok(1.0) + output(0.3) + metrics(0.5) + key(0.5)

    def test_score_crashing_node(self) -> None:
        """A crashed node is clamped to a zero score."""
        node = SolutionNode(
            node_id="test",
            files={"main.py": "x"},
            runs_ok=False,
            stderr="Error: something broke",
        )
        score = CodeAgent._score_node(node, "metric")
        assert score == 0.0  # no runs_ok, error penalty, max(0)

    def test_score_partial_output(self) -> None:
        """A run with short output and no metrics scores only the base 1.0."""
        node = SolutionNode(
            node_id="test",
            files={"main.py": "x"},
            runs_ok=True,
            stdout="short",
            metrics={},
        )
        score = CodeAgent._score_node(node, "metric")
        assert score == 1.0  # only runs_ok
# ---------------------------------------------------------------------------
# Helper methods
# ---------------------------------------------------------------------------
class TestHelpers:
    """Tests for CodeAgent static helpers and the _SimpleResult default."""

    def test_format_files(self) -> None:
        """_format_files renders each file as a ```filename:...``` block."""
        files = {"main.py": "print(1)", "utils.py": "x = 2"}
        formatted = CodeAgent._format_files(files)
        assert "```filename:main.py" in formatted
        assert "```filename:utils.py" in formatted
        assert "print(1)" in formatted

    def test_parse_json_direct(self) -> None:
        """_parse_json handles a plain JSON string."""
        result = CodeAgent._parse_json('{"score": 5}')
        assert result == {"score": 5}

    def test_parse_json_fenced(self) -> None:
        """_parse_json extracts JSON from a ```json fenced block."""
        text = 'Some text\n```json\n{"verdict": "APPROVE"}\n```\nmore text'
        result = CodeAgent._parse_json(text)
        assert result == {"verdict": "APPROVE"}

    def test_parse_json_embedded(self) -> None:
        """_parse_json finds a JSON object embedded in surrounding prose."""
        text = 'The review is: {"score": 7, "verdict": "REVISE"} end'
        result = CodeAgent._parse_json(text)
        assert result is not None
        assert result["score"] == 7

    def test_parse_json_invalid(self) -> None:
        """_parse_json returns None when no JSON can be recovered."""
        result = CodeAgent._parse_json("not json at all")
        assert result is None

    def test_simple_result_defaults(self) -> None:
        """_SimpleResult defaults to a failed (returncode 1), empty run."""
        r = _SimpleResult()
        assert r.returncode == 1
        assert r.stdout == ""
        assert r.timed_out is False
# ---------------------------------------------------------------------------
# Config integration test
# ---------------------------------------------------------------------------
class TestConfigIntegration:
    """Tests that CodeAgentConfig is wired into the experiment config tree."""

    def test_code_agent_config_in_experiment_config(self) -> None:
        """ExperimentConfig exposes an enabled-by-default code_agent section."""
        from researchclaw.config import CodeAgentConfig, ExperimentConfig
        exp = ExperimentConfig()
        assert hasattr(exp, "code_agent")
        assert isinstance(exp.code_agent, CodeAgentConfig)
        assert exp.code_agent.enabled is True

    def test_code_agent_config_from_dict(self, tmp_path: Path) -> None:
        """RCConfig.from_dict propagates experiment.code_agent overrides."""
        from researchclaw.config import RCConfig
        data = {
            "project": {"name": "test", "mode": "docs-first"},
            "research": {
                "topic": "test",
                "domains": ["ml"],
                "daily_paper_count": 1,
                "quality_threshold": 7.0,
            },
            "runtime": {"timezone": "UTC"},
            "notifications": {
                "channel": "local",
                "on_stage_start": True,
                "on_stage_fail": False,
                "on_gate_required": True,
            },
            "knowledge_base": {
                "backend": "markdown",
                "root": str(tmp_path / "kb"),
            },
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "TEST",
                "api_key": "test-key",
                "primary_model": "test",
                "fallback_models": [],
            },
            "experiment": {
                "mode": "sandbox",
                "code_agent": {
                    "enabled": False,
                    "tree_search_enabled": True,
                    "tree_search_candidates": 5,
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        assert cfg.experiment.code_agent.enabled is False
        assert cfg.experiment.code_agent.tree_search_enabled is True
        assert cfg.experiment.code_agent.tree_search_candidates == 5
# ---------------------------------------------------------------------------
# Prompts integration test
# ---------------------------------------------------------------------------
class TestPromptsIntegration:
    """Tests that the code-agent sub-prompts exist and interpolate arguments."""

    def test_architecture_planning_prompt_exists(self, pm: PromptManager) -> None:
        """architecture_planning prompt mentions 'architect' and echoes inputs."""
        sp = pm.sub_prompt(
            "architecture_planning",
            topic="image classification",
            exp_plan="test plan",
            metric="accuracy",
        )
        assert "architect" in sp.system.lower()
        assert "accuracy" in sp.user
        assert "image classification" in sp.user

    def test_code_exec_fix_prompt_exists(self, pm: PromptManager) -> None:
        """code_exec_fix prompt is debugging-oriented and includes the stderr."""
        sp = pm.sub_prompt(
            "code_exec_fix",
            stderr="ImportError: no module named foo",
            stdout_tail="loading data...",
            returncode="1",
            files_context="```filename:main.py\nimport foo\n```",
        )
        assert "debug" in sp.system.lower() or "fix" in sp.system.lower()
        assert "ImportError" in sp.user

    def test_code_reviewer_prompt_exists(self, pm: PromptManager) -> None:
        """code_reviewer prompt asks for review with APPROVE/REVISE verdicts."""
        sp = pm.sub_prompt(
            "code_reviewer",
            topic="RL",
            exp_plan="test plan",
            metric="reward",
            files_context="```filename:main.py\nprint('hi')\n```",
        )
        assert "review" in sp.system.lower()
        assert "reward" in sp.user
        assert "APPROVE" in sp.user or "REVISE" in sp.user
================================================
FILE: tests/test_code_searcher.py
================================================
"""Tests for the Code Searcher agent."""
from __future__ import annotations
import json
import time
import pytest
from pathlib import Path
from unittest.mock import MagicMock, patch
from researchclaw.agents.code_searcher.agent import CodeSearchAgent, CodeSearchResult
from researchclaw.agents.code_searcher.cache import SearchCache
from researchclaw.agents.code_searcher.github_client import (
CodeSnippet,
GitHubClient,
RepoAnalysis,
RepoInfo,
)
from researchclaw.agents.code_searcher.pattern_extractor import (
CodePatterns,
extract_patterns,
_heuristic_extract,
)
from researchclaw.agents.code_searcher.query_gen import (
generate_search_queries,
_heuristic_generate,
_extract_key_phrases,
)
from researchclaw.domains.detector import DomainProfile, get_profile
# ---------------------------------------------------------------------------
# Query Generation tests
# ---------------------------------------------------------------------------
class TestQueryGeneration:
    """Tests for GitHub search query generation (heuristic and wrapper paths)."""

    def test_heuristic_generates_queries(self):
        """Heuristic generation yields 3-5 queries that mention the libraries."""
        queries = _heuristic_generate(
            topic="finite element method for Poisson equation",
            domain_name="PDE Solvers",
            libraries=["numpy", "scipy", "fenics"],
            needs=["FEM assembly", "mesh generation"],
        )
        assert len(queries) >= 3
        assert len(queries) <= 5
        # Should include library names
        any_lib = any("numpy" in q or "scipy" in q or "fenics" in q for q in queries)
        assert any_lib

    def test_heuristic_no_duplicates(self):
        """Generated queries contain no case-insensitive duplicates."""
        queries = _heuristic_generate(
            topic="simple test",
            domain_name="Test",
            libraries=["numpy"],
            needs=[],
        )
        # No exact duplicates
        assert len(queries) == len(set(q.lower().strip() for q in queries))

    def test_extract_key_phrases(self):
        """Filler words like 'novel' and 'using' are stripped from titles."""
        result = _extract_key_phrases("A Novel Approach for Image Classification Using Deep Learning")
        # Should remove filler words
        assert "novel" not in result.lower()
        assert "using" not in result.lower()

    def test_generate_without_llm(self):
        """With llm=None the wrapper falls back to heuristic query generation."""
        queries = generate_search_queries(
            topic="molecular dynamics simulation",
            domain_name="Computational Physics",
            core_libraries=["jax", "numpy"],
            llm=None,
        )
        assert isinstance(queries, list)
        assert len(queries) >= 2
# ---------------------------------------------------------------------------
# Pattern Extractor tests
# ---------------------------------------------------------------------------
class TestPatternExtractor:
    """Tests for heuristic code-pattern extraction and CodePatterns helpers."""

    def test_heuristic_extract_imports(self):
        """Import statements in snippets become api_patterns entries."""
        snippets = [
            "import numpy as np\nimport scipy.sparse as sp\n\ndef solve():\n pass",
            "from pyscf import gto, scf\nmol = gto.M(atom='H 0 0 0')",
        ]
        patterns = _heuristic_extract(snippets)
        assert len(patterns.api_patterns) > 0
        assert any("numpy" in p for p in patterns.api_patterns)

    def test_heuristic_extract_functions(self):
        """Class/function definitions populate the file_structure mapping."""
        snippets = [
            "class Solver:\n pass\ndef solve_pde():\n pass\ndef analyze():\n pass",
        ]
        patterns = _heuristic_extract(snippets)
        assert len(patterns.file_structure) > 0

    def test_empty_snippets(self):
        """Extraction over no snippets yields a content-free CodePatterns."""
        patterns = extract_patterns([], topic="test", domain_name="test")
        assert not patterns.has_content

    def test_code_patterns_to_prompt(self):
        """to_prompt_context includes APIs, file names, and eval patterns."""
        patterns = CodePatterns(
            api_patterns=["import numpy as np\nresult = np.linalg.solve(A, b)"],
            file_structure={"solver.py": "Main solver implementation"},
            evaluation_patterns=["error = np.linalg.norm(x - x_exact)"],
        )
        ctx = patterns.to_prompt_context()
        assert "numpy" in ctx
        assert "solver.py" in ctx
        assert "error" in ctx

    def test_code_patterns_has_content(self):
        """has_content is False only when every pattern field is empty."""
        empty = CodePatterns()
        assert not empty.has_content
        with_data = CodePatterns(api_patterns=["import x"])
        assert with_data.has_content
# ---------------------------------------------------------------------------
# Search Cache tests
# ---------------------------------------------------------------------------
class TestSearchCache:
    """Tests for SearchCache storage, expiry, clearing, stats, and hashing."""

    def test_put_and_get(self, tmp_path):
        """A stored payload round-trips through put()/get()."""
        cache = SearchCache(cache_dir=tmp_path, ttl_days=30)
        data = {"api_patterns": ["import numpy"], "repos": []}
        cache.put("ml_vision", "image classification", data)
        result = cache.get("ml_vision", "image classification")
        assert result is not None
        assert result["api_patterns"] == ["import numpy"]

    def test_cache_miss(self, tmp_path):
        """Looking up a never-stored key returns None."""
        cache = SearchCache(cache_dir=tmp_path)
        result = cache.get("unknown", "unknown topic")
        assert result is None

    def test_cache_expiry(self, tmp_path):
        """Entries older than ttl_days are treated as misses."""
        cache = SearchCache(cache_dir=tmp_path, ttl_days=0)  # immediate expiry
        data = {"test": True}
        cache.put("test", "topic", data)
        # Manually set old timestamp
        cache_path = tmp_path / "test"
        for f in cache_path.glob("*.json"):
            content = json.loads(f.read_text())
            content["_cached_at"] = time.time() - 86400  # 1 day ago
            f.write_text(json.dumps(content))
        result = cache.get("test", "topic")
        assert result is None  # expired

    def test_clear_domain(self, tmp_path):
        """clear(domain) removes only that domain's entries."""
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("ml_vision", "topic1", {"data": 1})
        cache.put("ml_vision", "topic2", {"data": 2})
        cache.put("physics", "topic3", {"data": 3})
        count = cache.clear("ml_vision")
        assert count == 2
        assert cache.get("ml_vision", "topic1") is None
        assert cache.get("physics", "topic3") is not None

    def test_clear_all(self, tmp_path):
        """clear() with no argument removes every entry."""
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("a", "t1", {"x": 1})
        cache.put("b", "t2", {"x": 2})
        count = cache.clear()
        assert count == 2

    def test_stats(self, tmp_path):
        """stats() reports the overall total and per-domain counts."""
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("ml_vision", "t1", {"x": 1})
        cache.put("ml_vision", "t2", {"x": 2})
        cache.put("physics", "t3", {"x": 3})
        stats = cache.stats()
        assert stats["total"] == 3
        assert stats.get("ml_vision", 0) == 2

    def test_topic_hash_deterministic(self):
        """_topic_hash is stable for identical input."""
        h1 = SearchCache._topic_hash("test topic")
        h2 = SearchCache._topic_hash("test topic")
        assert h1 == h2

    def test_topic_hash_case_insensitive(self):
        """_topic_hash normalizes case before hashing."""
        h1 = SearchCache._topic_hash("Test Topic")
        h2 = SearchCache._topic_hash("test topic")
        assert h1 == h2
# ---------------------------------------------------------------------------
# GitHubClient tests (mocked)
# ---------------------------------------------------------------------------
class TestGitHubClient:
    """Tests for GitHubClient token detection and request-header construction."""

    def test_has_token_false(self):
        """An empty token (with env cleared) means has_token is False."""
        with patch.dict("os.environ", {}, clear=True):
            client = GitHubClient(token="")
            # env is cleared above, so there is no fallback token to pick up
            assert not client.has_token

    def test_has_token_true(self):
        """A non-empty token string means has_token is True."""
        client = GitHubClient(token="ghp_test123")
        assert client.has_token

    def test_headers_with_token(self):
        """With a token, headers carry a Bearer Authorization entry."""
        client = GitHubClient(token="ghp_test123")
        headers = client._headers()
        assert "Authorization" in headers
        assert "Bearer" in headers["Authorization"]

    def test_headers_without_token(self):
        """Without a token, no Authorization header is sent."""
        client = GitHubClient(token="")
        headers = client._headers()
        assert "Authorization" not in headers
# ---------------------------------------------------------------------------
# RepoInfo / CodeSnippet data class tests
# ---------------------------------------------------------------------------
class TestDataClasses:
    """Default values of the RepoInfo / CodeSnippet / RepoAnalysis records."""

    def test_repo_info_defaults(self):
        info = RepoInfo(full_name="owner/repo")
        assert info.stars == 0
        assert info.default_branch == "main"

    def test_code_snippet(self):
        snip = CodeSnippet(
            repo_full_name="owner/repo",
            file_path="src/main.py",
        )
        assert snip.content == ""

    def test_repo_analysis(self):
        report = RepoAnalysis(
            repo=RepoInfo(full_name="test/repo"),
            readme="# Test Repo",
            requirements=["numpy", "scipy"],
        )
        assert len(report.requirements) == 2
# ---------------------------------------------------------------------------
# CodeSearchResult tests
# ---------------------------------------------------------------------------
class TestCodeSearchResult:
    """Prompt-context rendering and cache serialization of CodeSearchResult."""

    def test_empty_result(self):
        empty = CodeSearchResult()
        assert empty.to_prompt_context() == ""
        assert not empty.from_cache

    def test_result_with_patterns(self):
        res = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=["import numpy as np"],
                file_structure={"main.py": "Entry point"},
            ),
        )
        assert "numpy" in res.to_prompt_context()

    def test_cache_roundtrip(self):
        # Serialize to a cache dict and rebuild; the restored copy must be
        # flagged as cached and must preserve every field.
        original = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=["import numpy"],
                file_structure={"main.py": "Entry"},
                evaluation_patterns=["error = norm(diff)"],
            ),
            repos_found=[
                RepoInfo(full_name="test/repo", stars=100, html_url="https://example.com"),
            ],
            queries_used=["test query"],
        )
        restored = CodeSearchResult.from_cache_dict(original.to_cache_dict())
        assert restored.from_cache
        assert restored.patterns.api_patterns == ["import numpy"]
        assert len(restored.repos_found) == 1
        assert restored.queries_used == ["test query"]
# ---------------------------------------------------------------------------
# CodeSearchAgent tests (mocked GitHub)
# ---------------------------------------------------------------------------
class TestCodeSearchAgent:
    """CodeSearchAgent behaviour with a fully mocked GitHub client."""

    def _mock_github(self):
        """Build a MagicMock GitHubClient with canned search results."""
        gh = MagicMock(spec=GitHubClient)
        gh.search_repos.return_value = [
            RepoInfo(
                full_name="user/physics-sim",
                description="Physics simulation framework",
                stars=500,
                html_url="https://github.com/user/physics-sim",
            ),
        ]
        gh.search_code.return_value = [
            CodeSnippet(
                repo_full_name="user/physics-sim",
                file_path="main.py",
                score=10.0,
            ),
        ]
        gh.get_readme.return_value = "# Physics Simulation\nA framework for physics sims."
        gh.get_repo_tree.return_value = ["main.py", "solver.py", "requirements.txt"]
        gh.get_file_content.return_value = "import numpy as np\ndef solve(): pass"
        gh.request_count = 5
        return gh

    def test_search_uses_cache(self, tmp_path):
        """A pre-populated cache entry short-circuits any GitHub traffic."""
        store = SearchCache(cache_dir=tmp_path)
        store.put("physics_simulation", "N-body sim", {
            "api_patterns": ["cached pattern"],
            "file_structure": {},
            "evaluation_patterns": [],
            "library_versions": {},
            "repos": [],
            "queries": ["cached query"],
        })
        agent = CodeSearchAgent(cache=store)
        profile = DomainProfile(
            domain_id="physics_simulation",
            display_name="Physics",
            core_libraries=["numpy"],
        )
        hit = agent.search("N-body sim", profile)
        assert hit.from_cache
        assert hit.patterns.api_patterns == ["cached pattern"]

    def test_search_with_mock_github(self, tmp_path):
        """A cache miss falls through to the (mocked) GitHub client."""
        gh = self._mock_github()
        agent = CodeSearchAgent(cache=SearchCache(cache_dir=tmp_path))
        agent._github = gh
        profile = DomainProfile(
            domain_id="physics_simulation",
            display_name="Computational Physics",
            core_libraries=["numpy", "scipy"],
            github_search_terms=["physics simulation python"],
        )
        outcome = agent.search("molecular dynamics simulation", profile)
        assert not outcome.from_cache
        assert len(outcome.queries_used) >= 2
        gh.search_repos.assert_called_once()

    def test_search_graceful_failure(self, tmp_path):
        """If GitHub fails, should still return empty result without crashing."""
        gh = MagicMock(spec=GitHubClient)
        gh.search_repos.side_effect = Exception("Network error")
        gh.search_code.side_effect = Exception("Network error")
        gh.request_count = 0
        agent = CodeSearchAgent(cache=SearchCache(cache_dir=tmp_path))
        agent._github = gh
        profile = DomainProfile(
            domain_id="test",
            display_name="Test",
            core_libraries=["numpy"],
        )
        outcome = agent.search("test topic", profile)
        # No exception may escape; a (possibly empty) result is returned.
        assert isinstance(outcome, CodeSearchResult)
================================================
FILE: tests/test_collaboration.py
================================================
"""Tests for the collaboration system (15+ tests).
Covers:
- ResearchRepository (publish, search, list)
- ArtifactPublisher (extraction from run dirs)
- ArtifactSubscriber (queries)
- Deduplication (content_hash, deduplicate_artifacts)
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.collaboration.repository import ResearchRepository
from researchclaw.collaboration.publisher import ArtifactPublisher
from researchclaw.collaboration.subscriber import ArtifactSubscriber
from researchclaw.collaboration.dedup import content_hash, deduplicate_artifacts
# ── Fixtures ─────────────────────────────────────────────────────────
@pytest.fixture
def repo(tmp_path: Path) -> ResearchRepository:
    """Fresh, empty research repository rooted under the test's tmp dir."""
    return ResearchRepository(repo_dir=tmp_path / "shared_repo")
@pytest.fixture
def populated_repo(repo: ResearchRepository) -> ResearchRepository:
    """Repository pre-loaded with artifacts from two fake runs."""
    repo.publish(
        run_id="run-001",
        artifacts={
            "literature_summary": {"papers": ["Paper A on transformer", "Paper B on vision"]},
            "experiment_results": {"accuracy": 0.95, "model": "ResNet50"},
        },
    )
    repo.publish(
        run_id="run-002",
        artifacts={
            "literature_summary": {"papers": ["Paper C on nlp transformer"]},
            "code_template": "import torch\nmodel = ResNet()\n# pytorch training",
        },
    )
    return repo
@pytest.fixture
def run_dir(tmp_path: Path) -> Path:
    """Create a fake pipeline run directory with stage outputs."""
    root = tmp_path / "run-test"
    root.mkdir()
    # (stage dir, file name, file content) — one entry per pipeline stage
    # the publisher knows how to extract from.
    stage_files = [
        ("stage-07-literature_synthesis", "synthesis.json",
         json.dumps({"papers": [{"title": "Test Paper", "year": 2024}]})),
        ("stage-10-code_generation", "main.py", "print('hello')"),
        ("stage-14-result_analysis", "experiment_summary.json",
         json.dumps({"accuracy": 0.92})),
        ("stage-18-peer_review", "review.md", "Good paper overall."),
    ]
    for stage, fname, content in stage_files:
        stage_dir = root / stage
        stage_dir.mkdir()
        (stage_dir / fname).write_text(content, encoding="utf-8")
    return root
# ── Repository Tests ─────────────────────────────────────────────────
class TestResearchRepository:
    """publish / search / list / import behaviour of ResearchRepository."""

    def test_publish(self, repo: ResearchRepository) -> None:
        published = repo.publish(
            run_id="run-001",
            artifacts={"literature_summary": {"papers": ["P1"]}},
        )
        assert published == 1

    def test_publish_creates_dirs(self, repo: ResearchRepository) -> None:
        repo.publish(
            run_id="run-new",
            artifacts={"code_template": "print('hi')"},
        )
        assert (repo.repo_dir / "run-new").is_dir()

    def test_publish_unknown_type_skipped(self, repo: ResearchRepository) -> None:
        # Unrecognised artifact types are dropped rather than stored.
        published = repo.publish(
            run_id="run-bad",
            artifacts={"unknown_type": "data"},
        )
        assert published == 0

    def test_search_by_query(self, populated_repo: ResearchRepository) -> None:
        hits = populated_repo.search("transformer")
        assert len(hits) >= 2

    def test_search_by_type(self, populated_repo: ResearchRepository) -> None:
        hits = populated_repo.search("paper", artifact_type="literature_summary")
        assert len(hits) >= 1

    def test_search_no_results(self, populated_repo: ResearchRepository) -> None:
        assert len(populated_repo.search("quantum_nonexistent_xyz")) == 0

    def test_search_empty_repo(self, repo: ResearchRepository) -> None:
        assert repo.search("anything") == []

    def test_list_runs(self, populated_repo: ResearchRepository) -> None:
        run_ids = populated_repo.list_runs()
        assert "run-001" in run_ids
        assert "run-002" in run_ids

    def test_list_runs_empty(self, repo: ResearchRepository) -> None:
        assert repo.list_runs() == []

    def test_get_run_artifacts(self, populated_repo: ResearchRepository) -> None:
        found = populated_repo.get_run_artifacts("run-001")
        assert "literature_summary" in found
        assert "experiment_results" in found

    def test_get_run_artifacts_missing(self, populated_repo: ResearchRepository) -> None:
        assert populated_repo.get_run_artifacts("run-999") == {}

    def test_import_literature(self, populated_repo: ResearchRepository) -> None:
        papers = populated_repo.import_literature("run-001")
        assert isinstance(papers, list)
        assert len(papers) >= 1

    def test_import_literature_missing_run(self, populated_repo: ResearchRepository) -> None:
        assert populated_repo.import_literature("run-999") == []

    def test_import_code_template(self, populated_repo: ResearchRepository) -> None:
        template = populated_repo.import_code_template("run-002", "pytorch")
        assert template is not None
        assert "torch" in template

    def test_import_code_template_no_match(self, populated_repo: ResearchRepository) -> None:
        assert populated_repo.import_code_template("run-002", "tensorflow_xyz") is None
# ── Publisher Tests ──────────────────────────────────────────────────
class TestArtifactPublisher:
    """ArtifactPublisher extraction of artifacts from pipeline run dirs."""

    def test_publish_from_run_dir(self, run_dir: Path, tmp_path: Path) -> None:
        target = ResearchRepository(repo_dir=tmp_path / "pub_repo")
        published = ArtifactPublisher(target).publish_from_run_dir("test-run", run_dir)
        assert published >= 1

    def test_publish_empty_dir(self, tmp_path: Path) -> None:
        bare = tmp_path / "empty_run"
        bare.mkdir()
        target = ResearchRepository(repo_dir=tmp_path / "pub_repo2")
        assert ArtifactPublisher(target).publish_from_run_dir("empty", bare) == 0

    def test_publish_nonexistent_dir(self, tmp_path: Path) -> None:
        # A missing run directory yields zero artifacts, not an error.
        target = ResearchRepository(repo_dir=tmp_path / "pub_repo3")
        assert ArtifactPublisher(target).publish_from_run_dir("missing", tmp_path / "nope") == 0
# ── Subscriber Tests ─────────────────────────────────────────────────
class TestArtifactSubscriber:
    """ArtifactSubscriber query helpers over a shared repository."""

    def test_find_relevant_literature(self, populated_repo: ResearchRepository) -> None:
        hits = ArtifactSubscriber(populated_repo).find_relevant_literature("transformer")
        assert len(hits) >= 1

    def test_find_similar_experiments(self, populated_repo: ResearchRepository) -> None:
        hits = ArtifactSubscriber(populated_repo).find_similar_experiments("resnet")
        assert len(hits) >= 1

    def test_find_code_templates(self, populated_repo: ResearchRepository) -> None:
        hits = ArtifactSubscriber(populated_repo).find_code_templates("pytorch")
        assert len(hits) >= 1

    def test_import_best_practices(self, populated_repo: ResearchRepository) -> None:
        summary = ArtifactSubscriber(populated_repo).import_best_practices("transformer")
        assert isinstance(summary, str)

    def test_import_best_practices_empty(self, repo: ResearchRepository) -> None:
        # An empty repository yields an empty best-practices string.
        assert ArtifactSubscriber(repo).import_best_practices("nonexistent") == ""
# ── Dedup Tests ──────────────────────────────────────────────────────
class TestDedup:
    """content_hash determinism and artifact deduplication."""

    def test_content_hash_deterministic(self) -> None:
        # Dict key order must not affect the digest.
        assert content_hash({"a": 1, "b": 2}) == content_hash({"b": 2, "a": 1})

    def test_content_hash_different(self) -> None:
        assert content_hash({"a": 1}) != content_hash({"a": 2})

    def test_deduplicate_artifacts(self) -> None:
        items = [
            {"content": {"x": 1}, "tags": ["a"]},
            {"content": {"x": 1}, "tags": ["b"]},  # same content, different tags
            {"content": {"y": 2}, "tags": ["c"]},
        ]
        assert len(deduplicate_artifacts(items)) == 2

    def test_deduplicate_empty(self) -> None:
        assert deduplicate_artifacts([]) == []
================================================
FILE: tests/test_compiler.py
================================================
"""Tests for researchclaw.templates.compiler — BUG-197 and general compilation.
BUG-197: pdflatex stdout containing broken UTF-8 (from U+202F error messages)
caused UnicodeDecodeError that killed the compilation pipeline, preventing
bibtex from running and leaving all citations as [?].
"""
from __future__ import annotations
import re
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.templates.compiler import (
CompileResult,
_is_fatal_error,
_sanitize_tex_unicode,
fix_common_latex_errors,
)
# ---------------------------------------------------------------------------
# _is_fatal_error
# ---------------------------------------------------------------------------
class TestIsFatalError:
    """Classification of pdflatex log lines by _is_fatal_error."""

    def test_unicode_char_not_set_up_is_nonfatal(self):
        """BUG-197: Unicode character errors should be non-fatal.

        The error line captured by _parse_log is a single line:
        ``! LaTeX Error: Unicode character X (U+202F)``
        (the "not set up" text is on a continuation line).
        """
        line = "! LaTeX Error: Unicode character \u202f (U+202F)"
        assert not _is_fatal_error(line)

    def test_unicode_char_various_codepoints_nonfatal(self):
        # Several space-like codepoints that commonly appear in LLM output.
        for cp in ["U+00A0", "U+2009", "U+2007", "U+3000"]:
            line = f"! LaTeX Error: Unicode character X ({cp})"
            assert not _is_fatal_error(line), f"Expected non-fatal for {cp}"

    def test_undefined_control_sequence_is_fatal(self):
        assert _is_fatal_error("! Undefined control sequence.")

    def test_missing_dollar_is_fatal(self):
        assert _is_fatal_error("! Missing $ inserted.")

    def test_overfull_hbox_is_nonfatal(self):
        assert not _is_fatal_error("! Overfull \\hbox (12.3pt too wide)")

    def test_float_lost_is_nonfatal(self):
        assert not _is_fatal_error("! Float(s) lost.")

    def test_unavailable_in_encoding_is_nonfatal(self):
        assert not _is_fatal_error(
            "! Package inputenc Error: Unicode character unavailable in encoding OT1."
        )

    def test_emergency_stop_is_fatal(self):
        assert _is_fatal_error(
            "! ==> Fatal error occurred, no output PDF file produced!"
        )

    def test_non_bang_file_not_found_is_fatal(self):
        # Fatal even though the line does not start with "!".
        assert _is_fatal_error("File `missing.sty' not found.")
# ---------------------------------------------------------------------------
# _sanitize_tex_unicode
# ---------------------------------------------------------------------------
class TestSanitizeTexUnicode:
    """In-place Unicode scrubbing of .tex files by _sanitize_tex_unicode."""

    @staticmethod
    def _sanitize(tmp_path: Path, content: str) -> str:
        """Write content to a temp .tex file, sanitize it, return the result."""
        tex = tmp_path / "test.tex"
        tex.write_text(content, encoding="utf-8")
        _sanitize_tex_unicode(tex)
        return tex.read_text(encoding="utf-8")

    def test_replaces_narrow_no_break_space(self, tmp_path: Path):
        """BUG-197: U+202F should be replaced with ASCII space."""
        assert self._sanitize(tmp_path, "Hello\u202fWorld\n") == "Hello World\n"

    def test_replaces_no_break_space(self, tmp_path: Path):
        """U+00A0 should be replaced with ASCII space."""
        assert self._sanitize(tmp_path, "Hello\u00a0World\n") == "Hello World\n"

    def test_removes_zero_width_space(self, tmp_path: Path):
        """U+200B should be removed entirely."""
        assert self._sanitize(tmp_path, "Hello\u200bWorld\n") == "HelloWorld\n"

    def test_removes_bom(self, tmp_path: Path):
        """U+FEFF BOM should be removed."""
        assert self._sanitize(tmp_path, "\ufeffHello\n") == "Hello\n"

    def test_preserves_normal_text(self, tmp_path: Path):
        """Normal ASCII + standard Unicode should be untouched."""
        content = "Hello World, \\section{Intro} $x^2$\n"
        assert self._sanitize(tmp_path, content) == content

    def test_handles_multiple_types(self, tmp_path: Path):
        """Multiple types of problematic chars in one file."""
        assert self._sanitize(tmp_path, "A\u202fB\u00a0C\u200bD\u200eE\n") == "A B CDE\n"

    def test_nonexistent_file(self, tmp_path: Path):
        """Should not crash on nonexistent file."""
        _sanitize_tex_unicode(tmp_path / "nonexistent.tex")

    def test_cyrillic_transliterated_to_latin(self, tmp_path: Path):
        """BUG-201: Cyrillic author names should be transliterated."""
        result = self._sanitize(tmp_path, "А. И. Колесников\n")
        assert "А" not in result  # no Cyrillic left
        assert "И" not in result
        assert "A. I. Kolesnikov" in result
# ---------------------------------------------------------------------------
# _sanitize_bib_file — Cyrillic transliteration
# ---------------------------------------------------------------------------
class TestSanitizeBibFile:
    """_sanitize_bib_file fixes for bibliography files."""

    def test_cyrillic_author_transliterated(self, tmp_path: Path):
        """BUG-201: Cyrillic in bib author names should be transliterated."""
        from researchclaw.templates.compiler import _sanitize_bib_file

        bib_path = tmp_path / "references.bib"
        bib_path.write_text(
            '@article{dehghani2023scaling,\n'
            ' author = {А. И. Колесников and J. Doe},\n'
            ' title = {Scaling Vision},\n'
            '}\n',
            encoding="utf-8",
        )
        _sanitize_bib_file(bib_path)
        sanitized = bib_path.read_text(encoding="utf-8")
        assert "А" not in sanitized  # Cyrillic removed
        assert "A. I. Kolesnikov" in sanitized
        assert "J. Doe" in sanitized  # Latin unchanged
# ---------------------------------------------------------------------------
# fix_common_latex_errors — Unicode handler
# ---------------------------------------------------------------------------
class TestFixUnicodeErrors:
    """fix_common_latex_errors handling of Unicode character issues."""

    def test_unicode_u202f_replaced_with_space(self):
        """BUG-197: U+202F in text should be replaced with space."""
        source = "Hello\u202fWorld"
        log_lines = ["! LaTeX Error: Unicode character \u202f (U+202F)"]
        fixed, applied = fix_common_latex_errors(source, log_lines)
        assert "\u202f" not in fixed
        assert "Hello World" in fixed
        assert any("U+202F" in f for f in applied)

    def test_unicode_u200b_removed(self):
        """U+200B (zero-width space, category Cf) should be removed."""
        source = "Hello\u200bWorld"
        log_lines = ["! LaTeX Error: Unicode character \u200b (U+200B)"]
        fixed, _ = fix_common_latex_errors(source, log_lines)
        assert "\u200b" not in fixed
        assert "HelloWorld" in fixed

    def test_no_unicode_error_no_change(self):
        """Text without the offending char should not be modified."""
        source = "Hello World"
        log_lines = ["! LaTeX Error: Unicode character \u202f (U+202F)"]
        fixed, applied = fix_common_latex_errors(source, log_lines)
        assert fixed == source
        # No fix should be recorded, since the char never appeared in the text.
        assert not any("U+202F" in f for f in applied)
# ---------------------------------------------------------------------------
# _run_pdflatex — bytes mode decoding
# ---------------------------------------------------------------------------
class TestRunPdflatexByteMode:
    """_run_pdflatex must tolerate broken UTF-8 in captured stdout."""

    @staticmethod
    def _fake_proc(stdout: bytes, returncode: int) -> MagicMock:
        """Build a stub subprocess.run() result with bytes stdout/stderr."""
        proc = MagicMock()
        proc.stdout = stdout
        proc.stderr = b""
        proc.returncode = returncode
        return proc

    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_broken_utf8_in_stdout_does_not_crash(self, mock_run):
        """BUG-197: Broken UTF-8 bytes should be decoded with replacement."""
        from researchclaw.templates.compiler import _run_pdflatex

        # \xe2\x80 is a truncated multi-byte sequence — invalid UTF-8.
        mock_run.return_value = self._fake_proc(b"Normal output \xe2\x80 broken", 1)
        log_text, success = _run_pdflatex(Path("/tmp"), "test.tex", timeout=60)
        assert log_text is not None
        assert "Normal output" in log_text
        assert not success

    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_valid_utf8_works(self, mock_run):
        """Normal UTF-8 output should work fine."""
        from researchclaw.templates.compiler import _run_pdflatex

        mock_run.return_value = self._fake_proc(b"Output written on test.pdf (1 page)", 0)
        log_text, success = _run_pdflatex(Path("/tmp"), "test.tex", timeout=60)
        assert log_text is not None
        assert "Output written" in log_text
        assert success
# ---------------------------------------------------------------------------
# _run_bibtex — bytes mode decoding + logging
# ---------------------------------------------------------------------------
class TestRunBibtex:
    """_run_bibtex error handling, missing binary, and byte decoding."""

    @staticmethod
    def _fake_proc(stdout: bytes, returncode: int) -> MagicMock:
        """Build a stub subprocess.run() result with bytes stdout/stderr."""
        proc = MagicMock()
        proc.stdout = stdout
        proc.stderr = b""
        proc.returncode = returncode
        return proc

    @patch("researchclaw.templates.compiler.shutil.which", return_value="/usr/bin/bibtex")
    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_bibtex_failure_logged(self, mock_run, mock_which, tmp_path):
        """Failed bibtex should log warning and return False."""
        from researchclaw.templates.compiler import _run_bibtex

        mock_run.return_value = self._fake_proc(b"I couldn't open file name.aux", 1)
        assert _run_bibtex(tmp_path, "paper", timeout=60) is False

    @patch("researchclaw.templates.compiler.shutil.which", return_value="/usr/bin/bibtex")
    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_bibtex_success_with_bbl(self, mock_run, mock_which, tmp_path):
        """Successful bibtex with .bbl creation should return True."""
        from researchclaw.templates.compiler import _run_bibtex

        # A .bbl file must exist for the success check to pass.
        (tmp_path / "paper.bbl").write_text("\\begin{thebibliography}{}")
        mock_run.return_value = self._fake_proc(b"Database file #1: references.bib", 0)
        assert _run_bibtex(tmp_path, "paper", timeout=60) is True

    @patch("researchclaw.templates.compiler.shutil.which", return_value=None)
    def test_bibtex_not_found(self, mock_which, tmp_path):
        """Missing bibtex binary should return False."""
        from researchclaw.templates.compiler import _run_bibtex

        assert _run_bibtex(tmp_path, "paper", timeout=60) is False

    @patch("researchclaw.templates.compiler.shutil.which", return_value="/usr/bin/bibtex")
    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_bibtex_broken_utf8(self, mock_run, mock_which, tmp_path):
        """BUG-197: Broken UTF-8 in bibtex output should not crash."""
        from researchclaw.templates.compiler import _run_bibtex

        (tmp_path / "paper.bbl").write_text("\\begin{thebibliography}{}")
        mock_run.return_value = self._fake_proc(b"Database file \xe2\x80 broken", 0)
        # Must not raise UnicodeDecodeError.
        assert _run_bibtex(tmp_path, "paper", timeout=60) is True
================================================
FILE: tests/test_convergence_evaluator.py
================================================
"""Tests for the convergence study evaluator."""
from __future__ import annotations
import math
import pytest
from researchclaw.experiment.evaluators.convergence import (
ConvergenceReport,
ConvergenceResult,
analyze_convergence,
compute_convergence_order,
)
# ---------------------------------------------------------------------------
# compute_convergence_order tests
# ---------------------------------------------------------------------------
class TestComputeConvergenceOrder:
    """Order-of-accuracy estimation from (h, error) samples."""

    def test_second_order(self):
        """h, h/2, h/4, h/8 with error ~ h^2."""
        step_sizes = [0.1, 0.05, 0.025, 0.0125]
        errs = [h ** 2 for h in step_sizes]
        order, r2 = compute_convergence_order(step_sizes, errs)
        assert abs(order - 2.0) < 0.1
        assert r2 > 0.99

    def test_fourth_order(self):
        """Error ~ h^4."""
        step_sizes = [0.1, 0.05, 0.025, 0.0125]
        errs = [h ** 4 for h in step_sizes]
        order, r2 = compute_convergence_order(step_sizes, errs)
        assert abs(order - 4.0) < 0.1
        assert r2 > 0.99

    def test_first_order(self):
        """Error ~ h."""
        step_sizes = [0.1, 0.05, 0.025, 0.0125]
        order, _ = compute_convergence_order(step_sizes, list(step_sizes))
        assert abs(order - 1.0) < 0.1

    def test_too_few_points(self):
        # A single sample cannot define a slope.
        order, r2 = compute_convergence_order([0.1], [0.01])
        assert order == 0.0
        assert r2 == 0.0

    def test_empty_input(self):
        order, _ = compute_convergence_order([], [])
        assert order == 0.0

    def test_filters_invalid(self):
        # Zero and negative step sizes must be filtered before fitting.
        step_sizes = [0.1, 0.0, 0.025, -0.01]
        errs = [0.01, 0.0, 0.001, 0.0001]
        order, _ = compute_convergence_order(step_sizes, errs)
        assert order > 0  # fit still succeeds on the remaining valid points
# ---------------------------------------------------------------------------
# analyze_convergence tests
# ---------------------------------------------------------------------------
class TestAnalyzeConvergence:
    """Full convergence reports produced by analyze_convergence."""

    def test_single_method(self):
        runs = {
            "euler": [
                {"h": 0.1, "error": 0.1},
                {"h": 0.05, "error": 0.05},
                {"h": 0.025, "error": 0.025},
            ]
        }
        report = analyze_convergence(runs)
        assert len(report.methods) == 1
        euler = report.methods[0]
        assert euler.method == "euler"
        assert abs(euler.convergence_order - 1.0) < 0.2
        assert report.best_method == "euler"

    def test_multiple_methods(self):
        runs = {
            "euler": [
                {"h": 0.1, "error": 0.1},
                {"h": 0.05, "error": 0.05},
                {"h": 0.025, "error": 0.025},
            ],
            "rk4": [
                {"h": 0.1, "error": 1e-4},
                {"h": 0.05, "error": 6.25e-6},
                {"h": 0.025, "error": 3.9e-7},
            ],
        }
        report = analyze_convergence(runs)
        assert len(report.methods) == 2
        # RK4 should come out with the higher estimated order.
        by_name = {entry.method: entry.convergence_order for entry in report.methods}
        assert by_name["rk4"] > by_name["euler"]
        assert report.best_method == "rk4"

    def test_expected_orders(self):
        runs = {
            "euler": [
                {"h": 0.1, "error": 0.1},
                {"h": 0.05, "error": 0.05},
                {"h": 0.025, "error": 0.025},
            ],
        }
        report = analyze_convergence(runs, expected_orders={"euler": 1.0})
        assert report.methods[0].expected_order == 1.0
        assert report.methods[0].order_matches_expected is True

    def test_non_converging(self):
        runs = {
            "bad_method": [
                {"h": 0.1, "error": 0.5},
                {"h": 0.05, "error": 0.6},  # error grows as h shrinks
                {"h": 0.025, "error": 0.7},
            ],
        }
        report = analyze_convergence(runs)
        # Negative or very low order indicates no convergence.
        assert not report.methods[0].is_converging

    def test_summary_string(self):
        runs = {
            "method_a": [
                {"h": 0.1, "error": 0.01},
                {"h": 0.05, "error": 0.0025},
            ],
        }
        report = analyze_convergence(runs)
        assert report.summary  # should not be empty
        assert "method_a" in report.summary

    def test_l2_error_key(self):
        """Should handle l2_error as the error key."""
        runs = {
            "fem": [
                {"h": 0.1, "l2_error": 0.01},
                {"h": 0.05, "l2_error": 0.0025},
                {"h": 0.025, "l2_error": 0.000625},
            ],
        }
        report = analyze_convergence(runs)
        assert abs(report.methods[0].convergence_order - 2.0) < 0.2

    def test_empty_data(self):
        report = analyze_convergence({})
        assert len(report.methods) == 0
        assert report.best_method == ""
================================================
FILE: tests/test_copilot.py
================================================
"""Tests for researchclaw.copilot — Interactive Co-Pilot Mode (Agent D2).
30+ tests covering modes, feedback, branching, and controller.
"""
from __future__ import annotations
import json
import shutil
import time
from datetime import date, timedelta
from pathlib import Path
from typing import Any
from unittest.mock import patch
import pytest
from researchclaw.copilot.modes import ResearchMode
from researchclaw.copilot.feedback import (
FEEDBACK_ACTIONS,
Feedback,
FeedbackHandler,
)
from researchclaw.copilot.branching import BranchManager
from researchclaw.copilot.controller import CoPilotController
from researchclaw.config import CoPilotConfig
# ===================================================================
# ResearchMode tests
# ===================================================================
class TestResearchMode:
    """Enum values and value-lookup behaviour of ResearchMode."""

    def test_all_modes(self):
        assert ResearchMode.CO_PILOT.value == "co-pilot"
        assert ResearchMode.AUTO_PILOT.value == "auto-pilot"
        assert ResearchMode.ZERO_TOUCH.value == "zero-touch"

    def test_from_value(self):
        # Constructing from the string value round-trips to the member.
        assert ResearchMode("co-pilot") == ResearchMode.CO_PILOT
        assert ResearchMode("auto-pilot") == ResearchMode.AUTO_PILOT
        assert ResearchMode("zero-touch") == ResearchMode.ZERO_TOUCH

    def test_invalid_mode_raises(self):
        with pytest.raises(ValueError):
            ResearchMode("invalid")

    def test_mode_count(self):
        assert len(ResearchMode) == 3
# ===================================================================
# Feedback tests
# ===================================================================
class TestFeedback:
    """Feedback record: allowed actions, immutability, and defaults."""

    def test_feedback_actions_defined(self):
        expected_actions = {"approve", "modify", "retry", "skip", "discuss", "branch", "rollback"}
        assert FEEDBACK_ACTIONS == expected_actions

    def test_feedback_frozen(self):
        # Feedback instances are immutable: assignment must fail.
        fb = Feedback(action="approve", stage=5)
        with pytest.raises(AttributeError):
            fb.action = "retry"  # type: ignore[misc]

    def test_feedback_defaults(self):
        fb = Feedback(action="approve", stage=1)
        assert fb.message == ""
        assert fb.modifications is None
        assert fb.branch_name == ""
        assert fb.rollback_to is None

    def test_feedback_with_modifications(self):
        fb = Feedback(
            action="modify",
            stage=5,
            message="Update hypothesis",
            modifications={"hypothesis": "new hypothesis"},
        )
        assert fb.modifications == {"hypothesis": "new hypothesis"}
# ===================================================================
# FeedbackHandler tests
# ===================================================================
class TestFeedbackHandler:
def test_write_feedback_request(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
request_path = handler.write_feedback_request(
stage=5,
stage_name="LITERATURE_SCREEN",
summary="10 papers screened",
)
assert request_path.exists()
data = json.loads(request_path.read_text(encoding="utf-8"))
assert data["stage"] == 5
assert data["stage_name"] == "LITERATURE_SCREEN"
assert data["status"] == "waiting"
assert isinstance(data["options"], list)
def test_read_feedback_response_valid(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
response = {
"action": "approve",
"stage": 5,
"message": "Looks good",
}
resp_path = tmp_path / "copilot_feedback_response.json"
resp_path.write_text(json.dumps(response), encoding="utf-8")
fb = handler.read_feedback_response()
assert fb is not None
assert fb.action == "approve"
assert fb.stage == 5
assert fb.message == "Looks good"
def test_read_feedback_response_invalid_action(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
response = {"action": "invalid_action", "stage": 5}
resp_path = tmp_path / "copilot_feedback_response.json"
resp_path.write_text(json.dumps(response), encoding="utf-8")
fb = handler.read_feedback_response()
assert fb is None
def test_read_feedback_response_missing(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
assert handler.read_feedback_response() is None
def test_read_feedback_response_malformed(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
resp_path = tmp_path / "copilot_feedback_response.json"
resp_path.write_text("{invalid json", encoding="utf-8")
assert handler.read_feedback_response() is None
def test_read_feedback_response_with_rollback(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
response = {
"action": "rollback",
"stage": 15,
"rollback_to": 8,
}
resp_path = tmp_path / "copilot_feedback_response.json"
resp_path.write_text(json.dumps(response), encoding="utf-8")
fb = handler.read_feedback_response()
assert fb is not None
assert fb.action == "rollback"
assert fb.rollback_to == 8
def test_read_feedback_response_branch(self, tmp_path: Path):
handler = FeedbackHandler(tmp_path)
response = {
"action": "branch",
"stage": 9,
"branch_name": "alt_experiment",
}
resp_path = tmp_path / "copilot_feedback_response.json"
resp_path.write_text(json.dumps(response), encoding="utf-8")
fb = handler.read_feedback_response()
assert fb is not None
assert fb.branch_name == "alt_experiment"
def test_clear_request(self, tmp_path: Path):
    """clear_request removes a previously written request file."""
    handler = FeedbackHandler(tmp_path)
    handler.write_feedback_request(1, "TOPIC_INIT", "Done")
    handler.clear_request()
    request_file = tmp_path / "copilot_feedback_request.json"
    assert not request_file.exists()
def test_clear_request_no_file(self, tmp_path: Path):
    """Clearing when no request file exists must be a silent no-op."""
    FeedbackHandler(tmp_path).clear_request()
def test_wait_for_feedback_timeout(self, tmp_path: Path):
    """A zero-second timeout returns None when no response ever arrives."""
    handler = FeedbackHandler(tmp_path)
    outcome = handler.wait_for_feedback(
        stage=1, timeout_sec=0, poll_interval_sec=0.01
    )
    assert outcome is None
def test_wait_for_feedback_finds_response(self, tmp_path: Path):
    """wait_for_feedback picks up a response written while it is polling."""
    import threading

    handler = FeedbackHandler(tmp_path)
    response_file = tmp_path / "copilot_feedback_response.json"
    payload = json.dumps({"action": "approve", "stage": 5})

    def delayed_write():
        """Simulate delayed response writing."""
        # Give the poll loop a moment to start before the response appears.
        time.sleep(0.05)
        response_file.write_text(payload, encoding="utf-8")

    writer = threading.Thread(target=delayed_write)
    writer.start()
    feedback = handler.wait_for_feedback(stage=5, timeout_sec=2, poll_interval_sec=0.02)
    writer.join()
    assert feedback is not None
    assert feedback.action == "approve"
# ===================================================================
# BranchManager tests
# ===================================================================
class TestBranchManager:
    """Branch lifecycle: create, list, switch, delete, compare, count."""

    def test_create_branch(self, tmp_path: Path):
        # Seed two stage directories with one artifact each.
        (tmp_path / "stage-01").mkdir()
        (tmp_path / "stage-01" / "output.json").write_text("{}")
        (tmp_path / "stage-02").mkdir()
        (tmp_path / "stage-02" / "result.txt").write_text("ok")
        manager = BranchManager(tmp_path, max_branches=3)
        branch_root = Path(manager.create_branch("exp_alt", from_stage=2))
        assert branch_root.exists()
        # Copied artifacts and the metadata file must all be present.
        for relative in (
            "stage-01/output.json",
            "stage-02/result.txt",
            "branch_meta.json",
        ):
            assert (branch_root / relative).exists()
        meta = json.loads(
            (branch_root / "branch_meta.json").read_text(encoding="utf-8")
        )
        assert meta["name"] == "exp_alt"
        assert meta["from_stage"] == 2

    def test_create_branch_max_reached(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=1)
        manager.create_branch("b1", from_stage=1)
        with pytest.raises(ValueError, match="Maximum branches"):
            manager.create_branch("b2", from_stage=1)

    def test_create_branch_duplicate_name(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=5)
        manager.create_branch("dup", from_stage=1)
        with pytest.raises(ValueError, match="already exists"):
            manager.create_branch("dup", from_stage=1)

    def test_list_branches_empty(self, tmp_path: Path):
        assert BranchManager(tmp_path).list_branches() == []

    def test_list_branches(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=5)
        for name, stage in (("alpha", 1), ("beta", 2)):
            manager.create_branch(name, from_stage=stage)
        listed = manager.list_branches()
        assert len(listed) == 2
        assert {entry["name"] for entry in listed} == {"alpha", "beta"}

    def test_switch_branch(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=3)
        manager.create_branch("test_branch", from_stage=1)
        assert manager.switch_branch("test_branch").exists()

    def test_switch_branch_nonexistent(self, tmp_path: Path):
        with pytest.raises(ValueError, match="does not exist"):
            BranchManager(tmp_path).switch_branch("nonexistent")

    def test_delete_branch(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=3)
        manager.create_branch("doomed", from_stage=1)
        assert len(manager.list_branches()) == 1
        manager.delete_branch("doomed")
        assert len(manager.list_branches()) == 0

    def test_delete_branch_nonexistent(self, tmp_path: Path):
        with pytest.raises(ValueError, match="does not exist"):
            BranchManager(tmp_path).delete_branch("ghost")

    def test_compare_branches(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=5)
        (tmp_path / "stage-01").mkdir()
        (tmp_path / "stage-02").mkdir()
        manager.create_branch("a", from_stage=2)
        manager.create_branch("b", from_stage=1)
        comparison = manager.compare_branches("a", "b")
        assert comparison["branch_a"] == "a"
        assert comparison["stages_a"] == 2
        assert comparison["stages_b"] == 1

    def test_compare_branches_nonexistent(self, tmp_path: Path):
        manager = BranchManager(tmp_path, max_branches=3)
        manager.create_branch("real", from_stage=1)
        assert "error" in manager.compare_branches("real", "fake")

    def test_count_stages(self, tmp_path: Path):
        for name in ("stage-01", "stage-02", "other_dir"):
            (tmp_path / name).mkdir()
        # Only stage-* directories should be counted.
        assert BranchManager._count_stages(tmp_path) == 2
# ===================================================================
# CoPilotController tests
# ===================================================================
class TestCoPilotController:
    """Pause policy, stage presentation, and feedback dispatch."""

    def _make_config(self, **overrides) -> CoPilotConfig:
        """Build a CoPilotConfig from defaults, overridable per test."""
        settings = {
            "mode": "co-pilot",
            "pause_at_gates": True,
            "pause_at_every_stage": False,
            "feedback_timeout_sec": 3600,
            "allow_branching": True,
            "max_branches": 3,
        }
        settings.update(overrides)
        return CoPilotConfig(**settings)

    def _controller(self, tmp_path, **overrides) -> CoPilotController:
        """Shortcut: controller wired to a config built from overrides."""
        return CoPilotController(self._make_config(**overrides), tmp_path)

    def test_should_pause_zero_touch(self, tmp_path: Path):
        controller = self._controller(tmp_path, mode="zero-touch")
        assert controller.should_pause(5, is_gate=True) is False
        assert controller.should_pause(1, is_gate=False) is False

    def test_should_pause_auto_pilot_gate(self, tmp_path: Path):
        controller = self._controller(tmp_path, mode="auto-pilot")
        assert controller.should_pause(5, is_gate=True) is True
        assert controller.should_pause(1, is_gate=False) is False

    def test_should_pause_auto_pilot_gates_disabled(self, tmp_path: Path):
        controller = self._controller(
            tmp_path, mode="auto-pilot", pause_at_gates=False
        )
        assert controller.should_pause(5, is_gate=True) is False

    def test_should_pause_copilot_every_stage(self, tmp_path: Path):
        controller = self._controller(
            tmp_path, mode="co-pilot", pause_at_every_stage=True
        )
        assert controller.should_pause(1, is_gate=False) is True
        assert controller.should_pause(5, is_gate=True) is True

    def test_should_pause_copilot_gates_only(self, tmp_path: Path):
        controller = self._controller(
            tmp_path, mode="co-pilot", pause_at_every_stage=False
        )
        assert controller.should_pause(5, is_gate=True) is True
        assert controller.should_pause(1, is_gate=False) is False

    def test_present_stage_result(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        summary = controller.present_stage_result(
            stage_num=5,
            stage_name="LITERATURE_SCREEN",
            artifacts=["screen_report.json"],
            status="done",
        )
        for fragment in (
            "Stage 5: LITERATURE_SCREEN",
            "Status: done",
            "screen_report.json",
        ):
            assert fragment in summary

    def test_present_stage_result_with_error(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        summary = controller.present_stage_result(
            stage_num=12,
            stage_name="EXPERIMENT_RUN",
            artifacts=[],
            status="failed",
            error="CUDA out of memory",
        )
        assert "Error: CUDA out of memory" in summary

    def test_handle_feedback_approve(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        outcome = controller.handle_feedback(Feedback(action="approve", stage=5))
        assert outcome["instruction"] == "continue"

    def test_handle_feedback_modify(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        feedback = Feedback(
            action="modify",
            stage=5,
            message="Change approach",
            modifications={"key": "value"},
        )
        outcome = controller.handle_feedback(feedback)
        assert outcome["instruction"] == "apply_modifications"
        assert outcome["modifications"] == {"key": "value"}

    def test_handle_feedback_retry(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        outcome = controller.handle_feedback(Feedback(action="retry", stage=12))
        assert outcome["instruction"] == "rerun_stage"

    def test_handle_feedback_skip(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        outcome = controller.handle_feedback(Feedback(action="skip", stage=21))
        assert outcome["instruction"] == "skip_stage"

    def test_handle_feedback_branch(self, tmp_path: Path):
        controller = self._controller(tmp_path, allow_branching=True)
        feedback = Feedback(action="branch", stage=9, branch_name="alt_design")
        outcome = controller.handle_feedback(feedback)
        assert outcome["instruction"] == "branch_created"
        assert outcome["branch_name"] == "alt_design"

    def test_handle_feedback_branch_disabled(self, tmp_path: Path):
        controller = self._controller(tmp_path, allow_branching=False)
        outcome = controller.handle_feedback(Feedback(action="branch", stage=9))
        assert outcome["instruction"] == "branching_disabled"

    def test_handle_feedback_branch_max_reached(self, tmp_path: Path):
        controller = self._controller(tmp_path, allow_branching=True, max_branches=1)
        # The first branch consumes the quota; the second must fail.
        controller.handle_feedback(
            Feedback(action="branch", stage=1, branch_name="b1")
        )
        outcome = controller.handle_feedback(
            Feedback(action="branch", stage=2, branch_name="b2")
        )
        assert outcome["instruction"] == "branch_failed"

    def test_handle_feedback_rollback(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        outcome = controller.handle_feedback(
            Feedback(action="rollback", stage=15, rollback_to=8)
        )
        assert outcome["instruction"] == "rollback"
        assert outcome["rollback_to"] == 8

    def test_handle_feedback_unknown_action(self, tmp_path: Path):
        controller = self._controller(tmp_path)
        # A technically valid action that the dispatch does not handle
        # should fall through to "continue".
        outcome = controller.handle_feedback(Feedback(action="discuss", stage=1))
        assert outcome["instruction"] == "continue"

    def test_from_config_zero_touch_returns_none(self, tmp_path: Path):
        cfg = self._make_config(mode="zero-touch")
        assert CoPilotController.from_config(cfg, tmp_path) is None

    def test_from_config_copilot_returns_controller(self, tmp_path: Path):
        cfg = self._make_config(mode="co-pilot")
        controller = CoPilotController.from_config(cfg, tmp_path)
        assert controller is not None
        assert isinstance(controller, CoPilotController)

    def test_from_config_auto_pilot_returns_controller(self, tmp_path: Path):
        cfg = self._make_config(mode="auto-pilot")
        assert CoPilotController.from_config(cfg, tmp_path) is not None

    def test_handle_feedback_branch_default_name(self, tmp_path: Path):
        controller = self._controller(tmp_path, allow_branching=True)
        # No branch_name supplied: a default of "branch_<stage>" is generated.
        outcome = controller.handle_feedback(Feedback(action="branch", stage=9))
        assert outcome["instruction"] == "branch_created"
        assert outcome["branch_name"] == "branch_9"
================================================
FILE: tests/test_decision_agent.py
================================================
"""Tests for FigureDecisionAgent, NanoBananaAgent, and Docker renderer.
Covers:
- FigureDecisionAgent._parse_decisions() — JSON parsing edge cases
- FigureDecisionAgent._heuristic_decide() — fallback coverage
- FigureDecisionAgent._infer_backend() — backend classification
- FigureDecisionAgent._enforce_bounds() — min/max enforcement
- NanoBananaAgent._build_prompt() — prompt construction
- NanoBananaAgent._get_type_guidelines() — guideline lookup
- RendererAgent._execute_in_docker() — docker command construction
- strip_thinking_tags() — safety verification
- End-to-end decision + orchestration with mock LLM
"""
from __future__ import annotations

import json
import os
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from unittest import mock

import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
@dataclass
class _FakeLLMResponse:
content: str = ""
model: str = "gpt-4.1"
prompt_tokens: int = 100
completion_tokens: int = 200
total_tokens: int = 300
finish_reason: str = "stop"
truncated: bool = False
raw: dict = None # type: ignore[assignment]
def __post_init__(self):
if self.raw is None:
self.raw = {}
class _FakeLLM:
    """Minimal mock LLM client: records calls and replies with a canned string."""

    def __init__(self, response: str = "{}"):
        self._response = response
        # One entry per chat() invocation, for later inspection by tests.
        self.calls: list[dict[str, Any]] = []

    def chat(self, messages, *, system=None, max_tokens=None,
             temperature=None, json_mode=False, **kwargs):
        """Record the request and return the canned response."""
        record = {
            "messages": messages,
            "system": system,
            "json_mode": json_mode,
        }
        self.calls.append(record)
        return _FakeLLMResponse(content=self._response)
# =========================================================================
# FigureDecisionAgent._parse_decisions()
# =========================================================================
class TestParseDecisions:
    """Edge cases for JSON parsing in the decision agent."""

    def _agent(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        return FigureDecisionAgent(_FakeLLM())

    def test_valid_json_array(self):
        payload = json.dumps([
            {"section": "Method", "figure_type": "architecture_diagram",
             "backend": "image", "description": "Architecture overview",
             "priority": 1},
            {"section": "Results", "figure_type": "bar_comparison",
             "backend": "code", "description": "Main results", "priority": 1},
        ])
        decisions = self._agent()._parse_decisions(payload)
        # Order and backends survive round-tripping.
        assert [entry["backend"] for entry in decisions] == ["image", "code"]

    def test_json_inside_markdown_fences(self):
        payload = (
            '```json\n'
            '[{"section": "Method", "figure_type": "pipeline_overview", '
            '"backend": "image", "description": "Pipeline", "priority": 1}]\n'
            '```'
        )
        decisions = self._agent()._parse_decisions(payload)
        assert len(decisions) == 1
        assert decisions[0]["figure_type"] == "pipeline_overview"

    def test_json_with_surrounding_text(self):
        payload = (
            'Here are the decisions:\n'
            '[{"section": "Results", "figure_type": "heatmap", "backend": "code", '
            '"description": "Heatmap", "priority": 2}]\n'
            'That is all.'
        )
        assert len(self._agent()._parse_decisions(payload)) == 1

    def test_no_json_array_raises(self):
        with pytest.raises(ValueError, match="No JSON array"):
            self._agent()._parse_decisions("This is not JSON at all.")

    def test_empty_array(self):
        assert self._agent()._parse_decisions("[]") == []

    def test_non_dict_items_skipped(self):
        payload = json.dumps([
            "not a dict",
            42,
            {"section": "Method", "figure_type": "architecture_diagram",
             "backend": "image", "description": "Arch", "priority": 1},
        ])
        # Only the single dict entry survives filtering.
        assert len(self._agent()._parse_decisions(payload)) == 1

    def test_invalid_backend_auto_inferred(self):
        payload = json.dumps([
            {"section": "Method", "figure_type": "architecture_diagram",
             "backend": "invalid_backend", "description": "Arch", "priority": 1},
        ])
        decisions = self._agent()._parse_decisions(payload)
        # architecture_diagram maps to the image backend when backend is invalid.
        assert decisions[0]["backend"] == "image"

    def test_missing_fields_get_defaults(self):
        decisions = self._agent()._parse_decisions(json.dumps([{}]))
        assert len(decisions) == 1
        filled = decisions[0]
        assert filled["section"] == "Results"
        assert filled["figure_type"] == "bar_comparison"
        assert filled["backend"] == "code"
        assert filled["priority"] == 2
# =========================================================================
# FigureDecisionAgent._heuristic_decide()
# =========================================================================
class TestHeuristicDecide:
    """Rule-based fallback decision logic."""

    def _agent(self, min_figures=3, max_figures=10):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        return FigureDecisionAgent(
            _FakeLLM(), min_figures=min_figures, max_figures=max_figures
        )

    def test_with_experiments(self):
        decisions = self._agent()._heuristic_decide(
            topic="Graph anomaly detection",
            has_experiments=True,
            condition_summaries={"proposed": {}, "baseline": {}, "ablation": {}},
        )
        # Expect at least: arch diagram, bar comparison, training curve, pipeline.
        assert len(decisions) >= 4
        used_backends = {entry["backend"] for entry in decisions}
        assert "code" in used_backends
        assert "image" in used_backends

    def test_without_experiments(self):
        decisions = self._agent()._heuristic_decide(
            topic="Theoretical framework",
            has_experiments=False,
            condition_summaries={},
        )
        # No experiments: image-only figures (arch diagram + pipeline).
        assert len(decisions) >= 2
        assert all(entry["backend"] == "image" for entry in decisions)

    def test_ablation_trigger(self):
        """When >= 4 conditions, an ablation figure should be added."""
        decisions = self._agent()._heuristic_decide(
            topic="Test",
            has_experiments=True,
            condition_summaries={"a": {}, "b": {}, "c": {}, "d": {}},
        )
        assert any(
            "ablation" in entry["description"].lower() for entry in decisions
        )

    def test_max_figures_respected(self):
        decisions = self._agent(max_figures=2)._heuristic_decide(
            topic="Test",
            has_experiments=True,
            condition_summaries={"a": {}, "b": {}, "c": {}, "d": {}},
        )
        assert len(decisions) <= 2
# =========================================================================
# FigureDecisionAgent._infer_backend()
# =========================================================================
class TestInferBackend:
    """Classification of figure types into rendering backends."""

    # Data-driven figure types and their expected backend.
    CODE_TYPES = (
        "bar_comparison", "line_chart", "heatmap", "confusion_matrix",
        "training_curve", "ablation_chart", "scatter_plot",
    )
    IMAGE_TYPES = (
        "architecture_diagram", "method_flowchart", "pipeline_overview",
        "concept_illustration", "system_diagram",
    )

    def test_code_types(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        for t in self.CODE_TYPES:
            assert FigureDecisionAgent._infer_backend(t) == "code", f"Failed for {t}"

    def test_image_types(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        for t in self.IMAGE_TYPES:
            assert FigureDecisionAgent._infer_backend(t) == "image", f"Failed for {t}"

    def test_unknown_defaults_to_image(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        assert FigureDecisionAgent._infer_backend("unknown_chart_type") == "image"
# =========================================================================
# FigureDecisionAgent._enforce_bounds()
# =========================================================================
class TestEnforceBounds:
    """Min/max figure-count enforcement."""

    def _agent(self, min_figures=3, max_figures=6):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        return FigureDecisionAgent(
            _FakeLLM(), min_figures=min_figures, max_figures=max_figures
        )

    @staticmethod
    def _decision(section, figure_type, backend, description, priority):
        """Build one decision dict in the shape _enforce_bounds expects."""
        return {
            "section": section,
            "figure_type": figure_type,
            "backend": backend,
            "description": description,
            "priority": priority,
        }

    def test_min_padding(self):
        """When fewer than min figures, should pad."""
        agent = self._agent(min_figures=4)
        seed = [self._decision("Results", "bar_comparison", "code", "Test", 1)]
        assert len(agent._enforce_bounds(seed, has_experiments=True)) >= 4

    def test_max_truncation(self):
        """When more than max figures, should truncate."""
        agent = self._agent(max_figures=3)
        seed = [
            self._decision(f"S{i}", "bar_comparison", "code", f"Fig {i}", i)
            for i in range(8)
        ]
        assert len(agent._enforce_bounds(seed, has_experiments=True)) <= 3

    def test_ensures_image_figure(self):
        """Should add architecture diagram if none present."""
        agent = self._agent(min_figures=1)
        seed = [self._decision("Results", "bar_comparison", "code", "Bar", 1)]
        bounded = agent._enforce_bounds(seed, has_experiments=True)
        assert any(entry["backend"] == "image" for entry in bounded)

    def test_ensures_code_figure_with_experiments(self):
        """Should add bar_comparison if experiments exist but no code figure."""
        agent = self._agent(min_figures=1)
        seed = [self._decision("Method", "architecture_diagram", "image", "Arch", 1)]
        bounded = agent._enforce_bounds(seed, has_experiments=True)
        assert any(entry["backend"] == "code" for entry in bounded)
# =========================================================================
# NanoBananaAgent._build_prompt()
# =========================================================================
class TestBuildPrompt:
    """Prompt construction for image-generation requests."""

    def _agent(self):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        return NanoBananaAgent(
            _FakeLLM(), gemini_api_key="fake-key", use_sdk=False,
        )

    def test_prompt_contains_description(self):
        prompt = self._agent()._build_prompt(
            description="Encoder-decoder with attention",
            figure_type="architecture_diagram",
            section="Method",
            topic="Graph anomaly detection",
        )
        for fragment in (
            "Encoder-decoder with attention",
            "Method",
            "Graph anomaly detection",
        ):
            assert fragment in prompt

    def test_prompt_contains_style(self):
        prompt = self._agent()._build_prompt(
            description="Test",
            figure_type="architecture_diagram",
            section="Method",
            topic="Test",
        )
        lowered = prompt.lower()
        assert "academic" in lowered
        assert "publication" in lowered

    def test_prompt_varies_by_type(self):
        agent = self._agent()

        def build(figure_type):
            return agent._build_prompt(
                description="Test", figure_type=figure_type,
                section="Method", topic="Test",
            )

        # Different figure types should yield different guideline text.
        assert build("architecture_diagram") != build("method_flowchart")
# =========================================================================
# NanoBananaAgent._get_type_guidelines()
# =========================================================================
class TestGetTypeGuidelines:
    """Guideline lookup for figure types."""

    def test_known_types(self):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        for t in (
            "architecture_diagram", "method_flowchart", "pipeline_overview",
            "concept_illustration", "system_diagram", "attention_visualization",
            "comparison_illustration",
        ):
            assert len(NanoBananaAgent._get_type_guidelines(t)) > 0, \
                f"Empty guidelines for {t}"

    def test_unknown_type_falls_back(self):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        # Unknown types reuse the concept_illustration guidelines.
        assert (
            NanoBananaAgent._get_type_guidelines("totally_unknown")
            == NanoBananaAgent._get_type_guidelines("concept_illustration")
        )
# =========================================================================
# NanoBananaAgent — no API key
# =========================================================================
class TestNanoBananaNoKey:
    """Behavior when no Gemini API key is configured."""

    @staticmethod
    def _keyless_agent():
        """Agent constructed with an empty key (env must already be cleared)."""
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        return NanoBananaAgent(_FakeLLM(), gemini_api_key="", use_sdk=False)

    def test_execute_without_key_fails(self, tmp_path):
        # Clear the environment so no ambient key can be picked up.
        with mock.patch.dict(os.environ, {}, clear=True):
            agent = self._keyless_agent()
            result = agent.execute({
                "image_figures": [
                    {"figure_id": "fig_1", "description": "Test",
                     "figure_type": "architecture_diagram", "section": "Method"},
                ],
                "topic": "Test",
                "output_dir": str(tmp_path),
            })
        assert not result.success
        assert "API key" in result.error

    def test_execute_empty_figures_succeeds(self, tmp_path):
        with mock.patch.dict(os.environ, {}, clear=True):
            agent = self._keyless_agent()
            result = agent.execute({
                "image_figures": [],
                "topic": "Test",
                "output_dir": str(tmp_path),
            })
        assert result.success
        assert result.data["count"] == 0
# =========================================================================
# RendererAgent._execute_in_docker() — Docker command construction
# =========================================================================
class TestDockerRenderer:
    """Docker command construction and failure handling in RendererAgent."""

    def _agent(self):
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        return RendererAgent(
            _FakeLLM(),
            timeout_sec=10,
            use_docker=True,
            docker_image="researchclaw/experiment:latest",
        )

    @staticmethod
    def _layout(tmp_path, script_name, source):
        """Write a figure script under scripts/ and create an output dir."""
        script = tmp_path / "scripts" / script_name
        script.parent.mkdir(parents=True, exist_ok=True)
        script.write_text(source)
        out_dir = tmp_path / "output"
        out_dir.mkdir()
        return script, out_dir

    def test_docker_command_construction(self, tmp_path):
        """Verify docker command includes security flags."""
        agent = self._agent()
        script, out_dir = self._layout(tmp_path, "fig_test.py", "print('hello')")
        with mock.patch("subprocess.run") as mock_run:
            mock_run.return_value = subprocess.CompletedProcess(
                args=[], returncode=0, stdout="", stderr=""
            )
            agent._execute_in_docker(
                script_path=script, output_dir=out_dir, figure_id="fig_test"
            )
        cmd = mock_run.call_args[0][0]
        # Every sandbox-hardening flag must appear in the command line.
        for flag in ("--network", "none", "--read-only", "--rm", "--memory=512m"):
            assert flag in cmd
        joined = " ".join(cmd)
        assert "script.py:ro" in joined  # script mounted read-only
        assert "output:rw" in joined  # output mounted writable

    def test_docker_timeout_kills_container(self, tmp_path):
        """Verify container is killed on timeout."""
        agent = self._agent()
        script, out_dir = self._layout(
            tmp_path, "fig_timeout.py", "import time; time.sleep(999)"
        )
        with mock.patch("subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.TimeoutExpired(
                cmd=["docker", "run"], timeout=10
            )
            outcome = agent._execute_in_docker(
                script_path=script, output_dir=out_dir, figure_id="fig_timeout"
            )
        assert "timed out" in outcome["error"]

    def test_docker_not_found(self, tmp_path):
        """Verify graceful handling when Docker is not installed."""
        agent = self._agent()
        script, out_dir = self._layout(
            tmp_path, "fig_no_docker.py", "print('hello')"
        )
        with mock.patch("subprocess.run") as mock_run:
            mock_run.side_effect = FileNotFoundError("docker not found")
            outcome = agent._execute_in_docker(
                script_path=script, output_dir=out_dir, figure_id="fig_no_docker"
            )
        assert "not found" in outcome["error"]

    def test_docker_script_failure(self, tmp_path):
        """Verify error message includes stderr on non-zero exit."""
        agent = self._agent()
        script, out_dir = self._layout(
            tmp_path, "fig_fail.py", "raise Exception('boom')"
        )
        with mock.patch("subprocess.run") as mock_run:
            mock_run.return_value = subprocess.CompletedProcess(
                args=[], returncode=1,
                stdout="", stderr="Traceback: Exception: boom",
            )
            outcome = agent._execute_in_docker(
                script_path=script, output_dir=out_dir, figure_id="fig_fail"
            )
        assert outcome["error"]
        assert "boom" in outcome["error"]
# =========================================================================
# strip_thinking_tags() — safety tests
# =========================================================================
class TestStripThinkingTags:
    """Safety tests for strip_thinking_tags().

    NOTE(review): the literal <think> markers in this class were lost in an
    earlier text export, which left the assertions vacuous (e.g. `"" in text`
    is always true) or unsatisfiable. They are restored here — confirm the
    tag name matches the one the implementation actually strips.
    """

    def test_closed_tags_removed(self):
        from researchclaw.utils.thinking_tags import strip_thinking_tags
        text = "Hello<think>internal reasoning</think> World"
        assert strip_thinking_tags(text) == "Hello World"

    def test_no_tags(self):
        from researchclaw.utils.thinking_tags import strip_thinking_tags
        text = "Normal text without tags"
        assert strip_thinking_tags(text) == text

    def test_empty_string(self):
        from researchclaw.utils.thinking_tags import strip_thinking_tags
        assert strip_thinking_tags("") == ""

    def test_nested_code_preserved(self):
        """Literal <think> in code blocks should NOT be corrupted
        when used by chat() without strip_thinking=True."""
        text = '```python\n# The <think> tag is used by...\nprint("hello")\n```'
        # Without stripping, text is untouched
        assert "<think>" in text

    def test_unclosed_tag_behavior(self):
        """Document the behavior: an unclosed <think> removes everything after it."""
        from researchclaw.utils.thinking_tags import strip_thinking_tags
        text = "Prefix <think>reasoning that never closes"
        result = strip_thinking_tags(text)
        # The unclosed tag strips everything after
        assert "Prefix" in result
        assert "reasoning" not in result
# =========================================================================
# FigureDecisionAgent.execute() — full integration with mock LLM
# =========================================================================
class TestDecisionAgentExecute:
    """End-to-end decision runs with a mock LLM."""

    def test_llm_decision(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        canned = json.dumps([
            {"section": "Method", "figure_type": "architecture_diagram",
             "backend": "image", "description": "Arch", "priority": 1},
            {"section": "Results", "figure_type": "bar_comparison",
             "backend": "code", "description": "Results", "priority": 1},
            {"section": "Results", "figure_type": "heatmap",
             "backend": "code", "description": "Heatmap", "priority": 2},
        ])
        agent = FigureDecisionAgent(_FakeLLM(canned), min_figures=3)
        result = agent.execute({
            "topic": "Graph anomaly detection",
            "hypothesis": "GRACE improves detection",
            "paper_draft": "# Introduction\n...",
            "has_experiments": True,
            "condition_summaries": {"proposed": {}, "baseline": {}},
        })
        assert result.success
        assert result.data["total"] >= 3
        # Both backends must be represented in the split.
        assert len(result.data["code_figures"]) >= 1
        assert len(result.data["image_figures"]) >= 1

    def test_fallback_on_bad_llm(self):
        """When LLM returns garbage, heuristic fallback should kick in."""
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        agent = FigureDecisionAgent(_FakeLLM("This is not JSON"), min_figures=3)
        result = agent.execute({
            "topic": "Test topic",
            "has_experiments": True,
            "condition_summaries": {"a": {}, "b": {}},
        })
        # The heuristic path still produces a successful result.
        assert result.success
        assert result.data["total"] >= 3

    def test_fallback_on_no_llm(self):
        """When LLM is None, heuristic fallback should work."""
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        result = FigureDecisionAgent(None, min_figures=2).execute({
            "topic": "Test",
            "has_experiments": False,
            "condition_summaries": {},
        })
        assert result.success
        assert result.data["total"] >= 2
# =========================================================================
# CWD regression test (Issue #2)
# =========================================================================
class TestRendererCwd:
    """Verify the CWD is set to output_dir, not its parent (Issue #2)."""

    def test_local_cwd_is_output_dir(self, tmp_path):
        """Scripts using relative savefig should write to output_dir."""
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        agent = RendererAgent(_FakeLLM(), timeout_sec=10, use_docker=False)
        output_dir = tmp_path / "charts"
        with mock.patch("subprocess.run") as mock_run:
            mock_run.return_value = subprocess.CompletedProcess(
                args=[], returncode=0, stdout="", stderr=""
            )
            agent._execute_local(
                script_path=tmp_path / "test.py",
                output_dir=output_dir,
            )
        # call_args.kwargs is always the keyword-argument dict, so read "cwd"
        # from it directly; the old `call_args[1]`-with-isinstance guard was
        # dead code (index 1 is always a dict on a called mock).
        cwd = mock_run.call_args.kwargs.get("cwd")
        # CWD should be output_dir, NOT output_dir.parent
        assert cwd == str(output_dir.resolve())
# =========================================================================
# chat(strip_thinking=True) — opt-in parameter (Issue #1 fix)
# =========================================================================
class TestChatStripThinking:
    """Verify the opt-in strip_thinking parameter on LLMClient.chat().

    NOTE(review): the literal <think> markers in this class were lost in a
    text export, which made one assertion vacuous and the other impossible
    (`"" not in s` is false for every string). They are restored here —
    confirm the tag name matches what strip_thinking removes.
    """

    @staticmethod
    def _client():
        """Build an LLMClient pointed at a fake endpoint."""
        from researchclaw.llm.client import LLMClient, LLMConfig
        return LLMClient(LLMConfig(
            base_url="http://fake",
            api_key="fake-key",
            primary_model="test-model",
        ))

    @staticmethod
    def _fake_api_response(content):
        """OpenAI-style chat completion payload wrapping `content`."""
        return {
            "choices": [{
                "message": {"content": content},
                "finish_reason": "stop",
            }],
            "usage": {"prompt_tokens": 10, "completion_tokens": 20,
                      "total_tokens": 30},
            "model": "test-model",
        }

    def _chat(self, content, *, strip_thinking):
        """Run one chat() call against a mocked HTTP transport."""
        client = self._client()
        with mock.patch("urllib.request.urlopen") as mock_urlopen:
            mock_resp = mock.MagicMock()
            mock_resp.read.return_value = json.dumps(
                self._fake_api_response(content)
            ).encode()
            mock_resp.__enter__ = mock.MagicMock(return_value=mock_resp)
            mock_resp.__exit__ = mock.MagicMock(return_value=False)
            mock_urlopen.return_value = mock_resp
            return client.chat(
                [{"role": "user", "content": "test"}],
                strip_thinking=strip_thinking,
            )

    def test_strip_thinking_false_by_default(self):
        """Default chat() should NOT strip <think> tags."""
        result = self._chat(
            "<think>internal reasoning</think>The actual answer is 42.",
            strip_thinking=False,
        )
        # With strip_thinking=False, tags are preserved
        assert "<think>" in result.content

    def test_strip_thinking_true_removes_tags(self):
        """chat(strip_thinking=True) should strip <think> tags."""
        result = self._chat(
            "<think>internal reasoning</think>The actual answer is 42.",
            strip_thinking=True,
        )
        # With strip_thinking=True, tags are removed
        assert "<think>" not in result.content
        assert "The actual answer is 42." in result.content
# =========================================================================
# LaTeX converter — display math $$...$$ fix
# =========================================================================
class TestLatexDisplayMath:
    """Verify the $$...$$ → equation environment fix in converter.py."""

    def test_dollar_dollar_to_equation(self):
        """$$...$$ display math should become \\begin{equation}."""
        from researchclaw.templates.converter import _convert_block
        markdown = "\n".join([
            "Some text before.",
            "",
            "$$\\alpha_{ij} = \\frac{x}{y}$$",
            "",
            "Some text after.",
        ])
        converted = _convert_block(markdown)
        assert "\\begin{equation}" in converted
        assert "\\end{equation}" in converted
        assert "\\alpha_{ij}" in converted
        # Should NOT contain escaped $$
        assert "\\$\\$" not in converted

    def test_multiline_dollar_dollar(self):
        """$$...$$ spanning multiple lines should also convert."""
        from researchclaw.templates.converter import _convert_block
        markdown = "\n".join([
            "$$",
            "\\mathcal{L} = -\\log \\frac{a}{b}",
            "$$",
            "",
        ])
        converted = _convert_block(markdown)
        assert "\\begin{equation}" in converted
        assert "\\mathcal{L}" in converted

    def test_inline_dollar_dollar_not_escaped(self):
        """$$ in inline context should not be escaped to \\$\\$."""
        from researchclaw.templates.converter import _convert_inline
        converted = _convert_inline("The formula $$x+y$$ is important")
        # Should not contain \\textasciicircum or \\$
        assert "\\textasciicircum" not in converted
# =========================================================================
# LaTeX converter — figure [t] placement
# =========================================================================
class TestLatexFigurePlacement:
    """Verify figures use [t] placement specifier."""

    def test_figure_uses_top_placement(self):
        from researchclaw.templates.converter import _render_figure
        rendered = _render_figure("Test Caption", "charts/test.png")
        assert "\\begin{figure}[t]" in rendered
        assert "[ht]" not in rendered

    def test_figure_has_centering(self):
        from researchclaw.templates.converter import _render_figure
        rendered = _render_figure("My Figure", "path/to/image.png")
        # Every structural element of a well-formed figure block is present.
        for fragment in (
            "\\centering",
            "\\includegraphics",
            "\\caption{My Figure}",
            "\\label{fig:",
        ):
            assert fragment in rendered
# =========================================================================
# Pipeline wrapper — _chat_with_prompt strip_thinking default
# =========================================================================
class TestChatWithPromptStripThinking:
    """Verify _chat_with_prompt passes strip_thinking to llm.chat()."""

    @staticmethod
    def _make_mock_llm(content: str):
        """Return a MagicMock LLM whose chat() yields a canned LLMResponse."""
        from unittest.mock import MagicMock
        from researchclaw.llm.client import LLMResponse
        mock_llm = MagicMock()
        mock_llm.chat.return_value = LLMResponse(
            content=content, model="test", finish_reason="stop",
        )
        return mock_llm

    def test_default_strips_thinking(self):
        """_chat_with_prompt should pass strip_thinking=True by default."""
        from researchclaw.pipeline.executor import _chat_with_prompt
        mock_llm = self._make_mock_llm("clean output")
        # Return value intentionally ignored; only the forwarded flag matters.
        _chat_with_prompt(mock_llm, system="sys", user="hello")
        call_kwargs = mock_llm.chat.call_args
        assert call_kwargs.kwargs.get("strip_thinking") is True

    def test_can_disable_stripping(self):
        """_chat_with_prompt(strip_thinking=False) should forward the flag."""
        from researchclaw.pipeline.executor import _chat_with_prompt
        mock_llm = self._make_mock_llm("reasoning output")
        _chat_with_prompt(
            mock_llm, system="sys", user="hello", strip_thinking=False,
        )
        call_kwargs = mock_llm.chat.call_args
        assert call_kwargs.kwargs.get("strip_thinking") is False
================================================
FILE: tests/test_domain_detector.py
================================================
"""Tests for domain detection and profile loading."""
from __future__ import annotations
import pytest
from pathlib import Path
from researchclaw.domains.detector import (
DomainProfile,
ExperimentParadigm,
MetricType,
detect_domain,
detect_domain_id,
get_generic_profile,
get_profile,
is_ml_domain,
load_all_profiles,
_keyword_detect,
_profile_cache,
)
# ---------------------------------------------------------------------------
# Profile loading tests
# ---------------------------------------------------------------------------
class TestProfileLoading:
    """Loading and lookup behaviour of the domain-profile registry."""

    def setup_method(self):
        # Start every test from a cold cache so loads are deterministic.
        _profile_cache.clear()

    def test_load_all_profiles_returns_dict(self):
        loaded = load_all_profiles()
        assert isinstance(loaded, dict)
        assert len(loaded) >= 10  # we created 14 profiles

    def test_profiles_have_required_fields(self):
        for identifier, prof in load_all_profiles().items():
            assert prof.domain_id == identifier
            assert prof.display_name
            assert prof.experiment_paradigm
            assert prof.entry_point

    def test_get_profile_existing(self):
        vision = get_profile("ml_vision")
        assert vision is not None
        assert vision.domain_id == "ml_vision"
        assert vision.display_name == "Computer Vision (ML)"
        assert vision.gpu_required is True

    def test_get_profile_nonexistent(self):
        assert get_profile("nonexistent_domain_xyz") is None

    def test_get_generic_profile(self):
        generic = get_generic_profile()
        assert generic.domain_id == "generic"
        assert "numpy" in generic.core_libraries

    def test_ml_profiles_exist(self):
        for domain_id in ("ml_vision", "ml_nlp", "ml_rl", "ml_generic"):
            assert get_profile(domain_id) is not None, f"Missing profile: {domain_id}"

    def test_physics_profiles_exist(self):
        for domain_id in ("physics_simulation", "physics_pde"):
            assert get_profile(domain_id) is not None, f"Missing profile: {domain_id}"

    def test_other_domain_profiles_exist(self):
        remaining = (
            "mathematics_numerical",
            "chemistry_qm",
            "chemistry_molprop",
            "biology_singlecell",
            "economics_empirical",
            "security_detection",
            "robotics_control",
        )
        for domain_id in remaining:
            assert get_profile(domain_id) is not None, f"Missing profile: {domain_id}"

    def test_physics_profile_paradigm(self):
        pde = get_profile("physics_pde")
        assert pde is not None
        assert pde.experiment_paradigm == "convergence"
        assert "convergence_order_fit" in pde.statistical_tests

    def test_economics_profile_paradigm(self):
        econ = get_profile("economics_empirical")
        assert econ is not None
        assert econ.experiment_paradigm == "progressive_spec"
        assert "hausman_test" in econ.statistical_tests
# ---------------------------------------------------------------------------
# Keyword detection tests
# ---------------------------------------------------------------------------
class TestKeywordDetection:
    """Keyword heuristics map topic strings to the expected domain id."""

    def _expect(self, expected_domain, *topics):
        """Assert that every topic resolves to expected_domain."""
        for topic in topics:
            assert _keyword_detect(topic) == expected_domain

    def test_ml_vision_keywords(self):
        self._expect(
            "ml_vision",
            "image classification with ResNet",
            "convolutional neural network for object detection",
        )

    def test_ml_nlp_keywords(self):
        self._expect(
            "ml_nlp",
            "text classification using BERT",
            "natural language processing transformer",
        )

    def test_ml_rl_keywords(self):
        self._expect(
            "ml_rl",
            "reinforcement learning policy gradient",
            "actor-critic algorithm for robot control",
        )

    def test_physics_keywords(self):
        self._expect("physics_simulation", "molecular dynamics simulation with Lennard-Jones")
        self._expect("physics_pde", "finite element method for Navier-Stokes equation")

    def test_chemistry_keywords(self):
        self._expect("chemistry_qm", "DFT calculation with PySCF")
        self._expect("chemistry_molprop", "molecular property prediction using RDKit fingerprints")

    def test_biology_keywords(self):
        self._expect("biology_singlecell", "single-cell RNA-seq analysis with scanpy")

    def test_economics_keywords(self):
        self._expect(
            "economics_empirical",
            "panel data regression with fixed effects",
            "instrumental variable causal inference",
        )

    def test_math_keywords(self):
        self._expect(
            "mathematics_numerical",
            "Runge-Kutta ODE solver convergence",
            "numerical analysis of quadrature methods",
        )

    def test_security_keywords(self):
        self._expect("security_detection", "intrusion detection system for network traffic")

    def test_robotics_keywords(self):
        self._expect("robotics_control", "robot manipulation with MuJoCo")

    def test_generic_ml_fallback(self):
        self._expect(
            "ml_generic",
            "neural network training with pytorch",
            "deep learning for regression",
        )

    def test_unknown_topic(self):
        assert _keyword_detect("cooking recipes for italian food") is None

    def test_case_insensitive(self):
        self._expect("ml_vision", "IMAGE CLASSIFICATION WITH RESNET")
        self._expect("chemistry_qm", "DFT Calculation")
# ---------------------------------------------------------------------------
# detect_domain tests
# ---------------------------------------------------------------------------
class TestDetectDomain:
    """End-to-end detect_domain() resolution, including generic fallback."""

    def test_detect_ml_vision(self):
        found = detect_domain("image classification on CIFAR-10")
        assert is_ml_domain(found)
        assert found.domain_id == "ml_vision"

    def test_detect_physics(self):
        found = detect_domain("molecular dynamics simulation of Lennard-Jones fluid")
        assert found.domain_id == "physics_simulation"
        assert not is_ml_domain(found)

    def test_detect_with_hypotheses(self):
        # Hypotheses text should contribute to detection when the topic alone
        # is ambiguous.
        found = detect_domain(
            topic="novel numerical scheme",
            hypotheses="We propose a 4th order finite difference scheme for the Poisson equation",
        )
        assert found.domain_id == "physics_pde"

    def test_detect_generic_fallback(self):
        found = detect_domain("studying the behavior of abstract systems")
        assert found.domain_id == "generic"

    def test_detect_domain_id_shortcut(self):
        assert detect_domain_id("image classification") == "ml_vision"
        assert detect_domain_id("cooking recipes") == "generic"
# ---------------------------------------------------------------------------
# is_ml_domain tests
# ---------------------------------------------------------------------------
class TestIsMLDomain:
    """is_ml_domain() separates ML profiles from everything else."""

    def test_ml_domains(self):
        for identifier in ("ml_vision", "ml_nlp", "ml_rl", "ml_generic"):
            candidate = get_profile(identifier)
            assert candidate is not None
            assert is_ml_domain(candidate)

    def test_non_ml_domains(self):
        for identifier in ("physics_simulation", "chemistry_qm", "economics_empirical"):
            candidate = get_profile(identifier)
            assert candidate is not None
            assert not is_ml_domain(candidate)

    def test_generic_not_ml(self):
        assert not is_ml_domain(get_generic_profile())
# ---------------------------------------------------------------------------
# DomainProfile dataclass tests
# ---------------------------------------------------------------------------
class TestDomainProfile:
    """DomainProfile dataclass: default field values and overrides."""

    def test_default_values(self):
        minimal = DomainProfile(domain_id="test", display_name="Test")
        assert minimal.experiment_paradigm == ExperimentParadigm.COMPARISON.value
        assert minimal.entry_point == "main.py"
        assert minimal.gpu_required is False
        assert "paired_t_test" in minimal.statistical_tests

    def test_custom_values(self):
        customized = DomainProfile(
            domain_id="custom",
            display_name="Custom Domain",
            experiment_paradigm="convergence",
            gpu_required=True,
            core_libraries=["numpy", "custom_lib"],
        )
        assert customized.experiment_paradigm == "convergence"
        assert customized.gpu_required is True
        assert "custom_lib" in customized.core_libraries
# ---------------------------------------------------------------------------
# Enum tests
# ---------------------------------------------------------------------------
class TestEnums:
    """String values of the paradigm and metric-type enums."""

    def test_experiment_paradigm_values(self):
        expectations = {
            ExperimentParadigm.COMPARISON: "comparison",
            ExperimentParadigm.CONVERGENCE: "convergence",
            ExperimentParadigm.PROGRESSIVE_SPEC: "progressive_spec",
            ExperimentParadigm.SIMULATION: "simulation",
        }
        for member, text in expectations.items():
            assert member.value == text

    def test_metric_type_values(self):
        expectations = {
            MetricType.SCALAR: "scalar",
            MetricType.TABLE: "table",
            MetricType.CONVERGENCE: "convergence",
        }
        for member, text in expectations.items():
            assert member.value == text
# ---------------------------------------------------------------------------
# Domain detection accuracy test (36-topic benchmark)
# ---------------------------------------------------------------------------
class TestDetectionAccuracy:
    """Test domain detection accuracy on a diverse benchmark of topics.

    TOPIC_EXPECTATIONS pairs a free-text research topic with the domain id
    the detector is expected to produce. Both tests require > 90% accuracy,
    so a handful of misses (e.g. domains without profiles yet) is tolerated.
    """
    TOPIC_EXPECTATIONS = [
        # ML topics
        ("Image classification with ResNet on CIFAR-10", "ml_vision"),
        ("Object detection using YOLO", "ml_vision"),
        ("Text sentiment analysis with BERT", "ml_nlp"),
        ("Language model fine-tuning", "ml_nlp"),
        ("Reinforcement learning for Atari games", "ml_rl"),
        ("Policy gradient optimization in continuous control", "ml_rl"),
        ("Graph neural network for node classification", "ml_graph"),
        ("Knowledge distillation from large teacher models", "ml_compression"),
        ("GAN for image synthesis", "ml_generative"),
        ("Tabular data prediction with XGBoost", "ml_tabular"),
        ("Deep learning regression model", "ml_generic"),
        ("Neural network for time series forecasting", "ml_generic"),
        # Physics topics
        ("Molecular dynamics of Lennard-Jones particles", "physics_simulation"),
        ("N-body gravitational simulation", "physics_simulation"),
        ("Symplectic integrator for Hamiltonian systems", "physics_simulation"),
        ("Finite element solution of Poisson equation", "physics_pde"),
        ("Heat equation solver comparison", "physics_pde"),
        ("Navier-Stokes finite difference scheme", "physics_pde"),
        # Chemistry topics
        ("Hartree-Fock calculation for small molecules", "chemistry_qm"),
        ("DFT energy with PySCF", "chemistry_qm"),
        ("Molecular property prediction from SMILES", "chemistry_molprop"),
        ("Drug binding affinity with RDKit fingerprints", "chemistry_molprop"),
        # Biology topics
        ("Single-cell clustering with scanpy", "biology_singlecell"),
        ("scRNA-seq differential expression analysis", "biology_singlecell"),
        ("Genome variant calling pipeline", "biology_genomics"),
        ("Protein folding prediction", "biology_protein"),
        # Economics topics
        ("Panel data regression with fixed effects", "economics_empirical"),
        ("Instrumental variable estimation", "economics_empirical"),
        ("Causal inference with difference-in-differences", "economics_empirical"),
        # Math topics
        ("Runge-Kutta ODE solver convergence analysis", "mathematics_numerical"),
        ("Numerical quadrature comparison", "mathematics_numerical"),
        ("Convex optimization benchmark", "mathematics_optimization"),
        # Security topics
        ("Network intrusion detection system", "security_detection"),
        ("Malware classification using random forest", "security_detection"),
        # Robotics topics
        ("Robot manipulation policy learning", "robotics_control"),
        ("Locomotion control with MuJoCo", "robotics_control"),
    ]

    def _accuracy(self, detect):
        """Score a `topic -> domain_id` callable against the benchmark.

        Returns (accuracy, correct, total) for use in assertion messages.
        """
        total = len(self.TOPIC_EXPECTATIONS)
        correct = sum(
            1
            for topic, expected_domain in self.TOPIC_EXPECTATIONS
            if detect(topic) == expected_domain
        )
        return correct / total, correct, total

    def test_keyword_detection_accuracy(self):
        """Test that keyword detection achieves > 90% accuracy."""
        accuracy, correct, total = self._accuracy(_keyword_detect)
        assert accuracy > 0.90, (
            f"Keyword detection accuracy: {accuracy:.1%} ({correct}/{total}). "
            f"Expected > 90%."
        )

    def test_full_detection_accuracy(self):
        """Test that full detect_domain achieves > 90% accuracy."""
        accuracy, correct, total = self._accuracy(
            lambda topic: detect_domain(topic).domain_id
        )
        assert accuracy > 0.90, (
            f"Full detection accuracy: {accuracy:.1%} ({correct}/{total}). "
            f"Expected > 90%."
        )
================================================
FILE: tests/test_entry_point_validation.py
================================================
"""Tests for entry point path traversal validation."""
from __future__ import annotations
from pathlib import Path
from unittest.mock import patch
from researchclaw.experiment.sandbox import (
ExperimentSandbox,
validate_entry_point,
validate_entry_point_resolved,
)
# ── Unit tests: validate_entry_point (syntax) ─────────────────────────
class TestValidateEntryPoint:
    """Syntax-only checks — no filesystem needed."""

    def _accepts(self, candidate: str) -> None:
        """Assert the candidate entry point passes validation (None = ok)."""
        assert validate_entry_point(candidate) is None

    def test_valid_entry_point(self) -> None:
        self._accepts("main.py")

    def test_valid_nested_entry_point(self) -> None:
        self._accepts("src/train.py")

    def test_valid_dot_slash_prefix(self) -> None:
        self._accepts("./main.py")

    def test_valid_dot_in_middle(self) -> None:
        self._accepts("src/./train.py")

    def test_valid_deeply_nested(self) -> None:
        self._accepts("a/b/c/d/main.py")

    def test_rejects_absolute_path(self) -> None:
        message = validate_entry_point("/etc/passwd")
        assert message is not None
        assert "relative" in message.lower() or "absolute" in message.lower()

    def test_rejects_path_traversal(self) -> None:
        message = validate_entry_point("../../../etc/passwd")
        assert message is not None
        assert ".." in message

    def test_rejects_dotdot_in_middle(self) -> None:
        message = validate_entry_point("src/../../etc/passwd")
        assert message is not None
        assert ".." in message

    def test_rejects_empty_string(self) -> None:
        message = validate_entry_point("")
        assert message is not None
        assert "empty" in message.lower()

    def test_rejects_whitespace_only(self) -> None:
        message = validate_entry_point("   ")
        assert message is not None
        assert "empty" in message.lower()
# ── Unit tests: validate_entry_point_resolved (containment) ───────────
class TestValidateEntryPointResolved:
    """Resolve-based checks — needs a real staging directory."""

    def test_valid_path_passes(self, tmp_path: Path) -> None:
        (tmp_path / "main.py").write_text("pass")
        assert validate_entry_point_resolved(tmp_path, "main.py") is None

    def test_symlink_escape_rejected(self, tmp_path: Path) -> None:
        """A symlink pointing outside staging must be caught."""
        outside_file = tmp_path / "outside" / "secret.py"
        outside_file.parent.mkdir()
        outside_file.write_text("print('escaped!')")
        staging_dir = tmp_path / "staging"
        staging_dir.mkdir()
        # The entry point itself lives in staging, but resolves elsewhere.
        (staging_dir / "legit.py").symlink_to(outside_file)
        message = validate_entry_point_resolved(staging_dir, "legit.py")
        assert message is not None
        assert "escapes" in message.lower()

    def test_nested_valid_path_passes(self, tmp_path: Path) -> None:
        nested_dir = tmp_path / "src"
        nested_dir.mkdir()
        (nested_dir / "train.py").write_text("pass")
        assert validate_entry_point_resolved(tmp_path, "src/train.py") is None
# ── Integration tests: ExperimentSandbox.run_project() ────────────────
class TestExperimentSandboxEntryPointValidation:
    """Verify validation is wired into ExperimentSandbox.run_project()."""

    def _make_sandbox(self, tmp_path: Path) -> ExperimentSandbox:
        from researchclaw.config import SandboxConfig
        return ExperimentSandbox(SandboxConfig(), tmp_path / "work")

    def test_rejects_path_traversal(self, tmp_path: Path) -> None:
        project_dir = tmp_path / "proj"
        project_dir.mkdir()
        (project_dir / "main.py").write_text("print('hi')")
        box = self._make_sandbox(tmp_path)
        # Create escape target so .exists() alone wouldn't catch it
        work_dir = tmp_path / "work"
        work_dir.mkdir(parents=True, exist_ok=True)
        (work_dir / "escape.py").write_text("print('escaped!')")
        with patch("subprocess.run") as mock_run:
            outcome = box.run_project(project_dir, entry_point="../escape.py")
        assert outcome.returncode == -1
        assert ".." in outcome.stderr
        mock_run.assert_not_called()

    def test_rejects_absolute_path(self, tmp_path: Path) -> None:
        project_dir = tmp_path / "proj"
        project_dir.mkdir()
        (project_dir / "main.py").write_text("print('hi')")
        box = self._make_sandbox(tmp_path)
        with patch("subprocess.run") as mock_run:
            outcome = box.run_project(project_dir, entry_point="/etc/passwd")
        assert outcome.returncode == -1
        assert "relative" in outcome.stderr.lower() or "absolute" in outcome.stderr.lower()
        mock_run.assert_not_called()
# NOTE: A symlink integration test is not included here because the
# copy loop (write_bytes/read_bytes) follows symlinks and creates
# regular files in staging. The resolve check is defense-in-depth
# for future copy mechanism changes; see
# TestValidateEntryPointResolved.test_symlink_escape_rejected for
# the unit-level proof that the function catches symlink escapes.
================================================
FILE: tests/test_experiment_diagnosis.py
================================================
"""Tests for experiment_diagnosis — failure analysis agent."""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.pipeline.experiment_diagnosis import (
DeficiencyType,
ExperimentDiagnosis,
ExperimentQualityAssessment,
PaperMode,
assess_experiment_quality,
diagnose_experiment,
)
ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"
# ---------------------------------------------------------------------------
# Unit tests — individual checks
# ---------------------------------------------------------------------------
class TestMissingDependency:
    """Missing-package signals in stdout/stderr are classified correctly."""

    @staticmethod
    def _bare_summary():
        # Fresh minimal summary per call so tests cannot share mutable state.
        return {"condition_summaries": {}, "best_run": {"metrics": {}}}

    def test_detects_module_not_found(self):
        report = diagnose_experiment(
            experiment_summary=self._bare_summary(),
            stdout="",
            stderr="ModuleNotFoundError: No module named 'utils'",
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.MISSING_DEPENDENCY in found_types

    def test_detects_box2d(self):
        report = diagnose_experiment(
            experiment_summary=self._bare_summary(),
            stdout="BOX2D_WARNING: Box2D/LunarLander-v3 not available; skipping",
            stderr="",
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.MISSING_DEPENDENCY in found_types
class TestPermissionError:
    """Permission failures in stderr are classified as PERMISSION_ERROR."""

    def test_detects_hf_permission(self):
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stdout="",
            stderr="PermissionError: Cannot download huggingface model",
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.PERMISSION_ERROR in found_types
class TestTimeGuard:
    """TIME_GUARD skip messages trigger the dominant-time-guard deficiency."""

    def test_detects_dominant_time_guard(self):
        # 3 of 4 planned conditions skipped -> above the dominance threshold.
        exp_summary = {
            "condition_summaries": {"CondA": {"metrics": {"metric": 80.0}}},
            "best_run": {"metrics": {}},
        }
        exp_plan = {
            "conditions": [{"name": f"Cond{suffix}"} for suffix in "ABCD"],
        }
        report = diagnose_experiment(
            experiment_summary=exp_summary,
            experiment_plan=exp_plan,
            stdout="TIME_GUARD: skipping CondB\nTIME_GUARD: skipping CondC\nTIME_GUARD: skipping CondD",
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.TIME_GUARD_DOMINANT in found_types

    def test_no_time_guard_if_most_complete(self):
        exp_summary = {
            "condition_summaries": {
                name: {"metrics": {"metric": score}}
                for name, score in (("A", 1.0), ("B", 2.0), ("C", 3.0))
            },
            "best_run": {"metrics": {}},
        }
        exp_plan = {"conditions": [{"name": name} for name in "ABCD"]}
        report = diagnose_experiment(experiment_summary=exp_summary, experiment_plan=exp_plan)
        found_types = {item.type for item in report.deficiencies}
        # 1/4 skipped = 25%, below 50% threshold
        assert DeficiencyType.TIME_GUARD_DOMINANT not in found_types
class TestSyntheticData:
    """Synthetic-data fallback warnings in stdout are flagged."""

    def test_detects_synthetic_fallback(self):
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stdout="[data] WARNING: Alpaca load failed ... using synthetic data.",
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.SYNTHETIC_DATA_FALLBACK in found_types
class TestGPUOOM:
    """CUDA out-of-memory errors in stderr are flagged as GPU_OOM."""

    def test_detects_oom(self):
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr="RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB",
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.GPU_OOM in found_types
class TestIdenticalConditions:
    """Ablation warnings about identical outputs yield IDENTICAL_CONDITIONS."""

    def test_detects_from_ablation_warnings(self):
        exp_summary = {
            "condition_summaries": {
                name: {"metrics": {"m": 1}} for name in ("A", "B")
            },
            "best_run": {"metrics": {}},
            "ablation_warnings": [
                "ABLATION FAILURE: Conditions 'A' and 'B' produce identical outputs across all 1 metrics."
            ],
        }
        report = diagnose_experiment(experiment_summary=exp_summary)
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.IDENTICAL_CONDITIONS in found_types
class TestCodeCrash:
    """A Python traceback in stderr is classified as CODE_CRASH."""

    def test_detects_traceback(self):
        traceback_text = (
            "Traceback (most recent call last):\n"
            "  File 'main.py', line 42, in main\n"
            "    result = train(model)\n"
            "TypeError: train() missing argument 'data'\n"
        )
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr=traceback_text,
        )
        found_types = {item.type for item in report.deficiencies}
        assert DeficiencyType.CODE_CRASH in found_types
# ---------------------------------------------------------------------------
# Quality assessment
# ---------------------------------------------------------------------------
class TestQualityAssessment:
    """Paper-mode grading derived from experiment summaries."""

    def test_full_paper_mode(self):
        # 3 conditions x 2 seeds each -> enough evidence for a full paper.
        seeded_metrics = {}
        for cond, base in (("A", 80.0), ("B", 85.0), ("C", 90.0)):
            seeded_metrics[f"{cond}/0/m"] = base
            seeded_metrics[f"{cond}/1/m"] = base + 1.0
        exp_summary = {
            "condition_summaries": {
                "A": {"metrics": {"metric": 80.0}},
                "B": {"metrics": {"metric": 85.0}},
                "C": {"metrics": {"metric": 90.0}},
            },
            "best_run": {"metrics": seeded_metrics},
        }
        verdict = assess_experiment_quality(exp_summary)
        assert verdict.mode == PaperMode.FULL_PAPER
        assert verdict.sufficient

    def test_preliminary_study_mode(self):
        exp_summary = {
            "condition_summaries": {
                "A": {"metrics": {"metric": 80.0}},
                "B": {"metrics": {"metric": 85.0}},
            },
            "best_run": {"metrics": {"A/0/m": 80.0, "B/0/m": 85.0}},
        }
        verdict = assess_experiment_quality(exp_summary)
        assert verdict.mode == PaperMode.PRELIMINARY_STUDY
        assert not verdict.sufficient

    def test_technical_report_no_conditions(self):
        verdict = assess_experiment_quality({
            "condition_summaries": {},
            "best_run": {"metrics": {}},
        })
        assert verdict.mode == PaperMode.TECHNICAL_REPORT
        assert not verdict.sufficient

    def test_technical_report_synthetic_data(self):
        verdict = assess_experiment_quality({
            "condition_summaries": {"A": {"metrics": {"metric": 80.0}}},
            "best_run": {"metrics": {}, "stdout": "using synthetic data"},
        })
        assert verdict.mode == PaperMode.TECHNICAL_REPORT
# ---------------------------------------------------------------------------
# Repair prompt generation
# ---------------------------------------------------------------------------
class TestRepairPrompt:
    """Repair-prompt generation and serialization of a diagnosis."""

    def test_generates_prompt(self):
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr="ModuleNotFoundError: No module named 'special_lib'",
        )
        prompt_text = report.to_repair_prompt()
        for expected in ("special_lib", "DIAGNOSIS", "CRITICAL"):
            assert expected in prompt_text

    def test_serialization(self):
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {"A": {"metrics": {"m": 1}}}, "best_run": {"metrics": {}}},
        )
        payload = report.to_dict()
        assert isinstance(payload, dict)
        assert "deficiencies" in payload
        assert "conditions_completed" in payload
# ---------------------------------------------------------------------------
# Integration — real artifacts
# ---------------------------------------------------------------------------
class TestRealArtifacts:
    """Diagnosis against real run artifacts; skipped when artifacts absent."""

    def _load(self, run_id: str) -> tuple[dict, dict | None]:
        """Load (summary, refinement_log) for a run id, skipping if missing."""
        candidates = sorted(ARTIFACTS.glob(f"rc-*-{run_id}"))
        if not candidates:
            pytest.skip(f"Artifact {run_id} not found")
        root = candidates[0]
        summary_file = root / "stage-14" / "experiment_summary.json"
        refinement_file = root / "stage-13" / "refinement_log.json"
        if not summary_file.exists():
            pytest.skip(f"No experiment_summary for {run_id}")
        summary = json.loads(summary_file.read_text())
        refinement = None
        if refinement_file.exists():
            refinement = json.loads(refinement_file.read_text())
        return summary, refinement

    def test_run_e57360_diagnosis(self):
        """Run 38 — 3/8 conditions completed, Box2D missing."""
        summary, refinement = self._load("e57360")
        verdict = assess_experiment_quality(summary, refinement)
        # Should identify issues and NOT rate as full_paper
        assert verdict.mode != PaperMode.FULL_PAPER or len(verdict.deficiencies) > 0

    def test_run_8b4a1b_diagnosis(self):
        """Run 8b4a1b — all NaN, permission errors."""
        summary, refinement = self._load("8b4a1b")
        verdict = assess_experiment_quality(summary, refinement)
        # Should be technical_report or preliminary_study at best
        assert verdict.mode in (PaperMode.TECHNICAL_REPORT, PaperMode.PRELIMINARY_STUDY)
class TestDatasetNotFoundError:
    """BUG-203: HuggingFace DatasetNotFoundError should be caught."""

    def test_detects_hf_dataset_not_found(self):
        captured_stderr = (
            "Traceback (most recent call last):\n"
            "  File \"/workspace/setup.py\", line 11, in main\n"
            "datasets.exceptions.DatasetNotFoundError: "
            "Dataset 'cifar10_corrupted' doesn't exist on the Hub or cannot be accessed.\n"
        )
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr=captured_stderr,
        )
        dataset_issues = [
            item for item in report.deficiencies
            if item.type == DeficiencyType.DATASET_UNAVAILABLE
        ]
        assert len(dataset_issues) >= 1
        assert "HuggingFace" in dataset_issues[0].description
        # Should NOT also appear as a generic CODE_CRASH
        crash_issues = [
            item for item in report.deficiencies
            if item.type == DeficiencyType.CODE_CRASH
        ]
        assert all("DatasetNotFoundError" not in crash.description for crash in crash_issues)

    def test_suggested_fix_mentions_precached(self):
        captured_stderr = (
            "DatasetNotFoundError: Dataset 'imagenet_v2' "
            "doesn't exist on the Hub or cannot be accessed.\n"
        )
        report = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr=captured_stderr,
        )
        dataset_issues = [
            item for item in report.deficiencies
            if item.type == DeficiencyType.DATASET_UNAVAILABLE
        ]
        assert any("/opt/datasets" in item.suggested_fix for item in dataset_issues)
class TestNearRandomAccuracy:
    """BUG-204: Detect near-random accuracy in experiment results."""

    @staticmethod
    def _hp_issues(report):
        """Filter deficiencies down to hyperparameter-issue entries."""
        return [
            item for item in report.deficiencies
            if item.type == DeficiencyType.HYPERPARAMETER_ISSUE
        ]

    def test_detects_near_random_cifar10(self):
        """8.91% accuracy on CIFAR-10 should be flagged."""
        report = diagnose_experiment(
            experiment_summary={
                "condition_summaries": {"cond_a": {"metrics": {"top1_accuracy": 8.91}}},
                "metrics_summary": {"top1_accuracy": {"min": 8.42, "max": 8.91, "mean": 8.67}},
                "best_run": {"metrics": {}},
            },
        )
        assert any("random chance" in item.description for item in self._hp_issues(report))

    def test_normal_accuracy_not_flagged(self):
        """73% accuracy should NOT be flagged."""
        report = diagnose_experiment(
            experiment_summary={
                "condition_summaries": {"baseline": {"metrics": {"accuracy": 73.07}}},
                "metrics_summary": {"accuracy": {"min": 68.0, "max": 73.07, "mean": 70.5}},
                "best_run": {"metrics": {}},
            },
        )
        assert not any("random chance" in item.description for item in self._hp_issues(report))

    def test_zero_accuracy_not_flagged(self):
        """0% accuracy (no data) should NOT be flagged by this check."""
        report = diagnose_experiment(
            experiment_summary={
                "condition_summaries": {},
                "metrics_summary": {},
                "best_run": {"metrics": {}},
            },
        )
        assert not any("random chance" in item.description for item in self._hp_issues(report))
class TestRealArtifactsContinued(TestRealArtifacts):
    """Continuation of real artifact tests (after TestDatasetNotFoundError)."""

    def test_run_acbdfa_diagnosis(self):
        """Run acbdfa — 2 architectures, S4D nearly random."""
        exp_summary, refinement = self._load("acbdfa")
        best_run = exp_summary.get("best_run", {})
        report = diagnose_experiment(
            experiment_summary=exp_summary,
            refinement_log=refinement,
            stdout=best_run.get("stdout", ""),
            stderr=best_run.get("stderr", ""),
        )
        assert report.completion_rate > 0
================================================
FILE: tests/test_experiment_repair.py
================================================
"""Tests for experiment_repair — repair loop and prompt generation."""
from __future__ import annotations
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.pipeline.experiment_diagnosis import (
DeficiencyType,
Deficiency,
ExperimentDiagnosis,
PaperMode,
)
from researchclaw.pipeline.experiment_repair import (
ExperimentRepairResult,
RepairCycleResult,
build_repair_prompt,
run_repair_loop,
select_best_results,
_extract_code_blocks,
_build_experiment_summary_from_run,
_load_experiment_code,
_load_experiment_summary,
_summary_quality_score,
)
# ---------------------------------------------------------------------------
# build_repair_prompt tests
# ---------------------------------------------------------------------------
class TestBuildRepairPrompt:
    """Prompt assembly for the experiment-repair LLM call."""

    def test_basic_prompt(self):
        """Core sections: task header, deficiency text, filenames, time budget."""
        diagnosis = ExperimentDiagnosis(
            deficiencies=[
                Deficiency(
                    type=DeficiencyType.MISSING_DEPENDENCY,
                    severity="critical",
                    description="Missing Python package: utils",
                    suggested_fix="Add 'utils' to requirements.txt",
                )
            ],
            conditions_completed=["CondA"],
            conditions_failed=["CondB"],
            total_planned=2,
            completion_rate=0.5,
            summary="1 deficiency. 1/2 conditions completed.",
        )
        text = build_repair_prompt(
            diagnosis=diagnosis,
            original_code={"main.py": "import utils\nprint('hello')"},
            time_budget_sec=2400,
        )
        for expected in ("EXPERIMENT REPAIR TASK", "utils", "main.py", "2400"):
            assert expected in text

    def test_scope_reduction_included(self):
        """Time-guard-dominated runs should surface scope-reduction guidance."""
        diagnosis = ExperimentDiagnosis(
            deficiencies=[
                Deficiency(
                    type=DeficiencyType.TIME_GUARD_DOMINANT,
                    severity="major",
                    description="Time guard killed 8/10 conditions",
                    affected_conditions=["C3", "C4", "C5"],
                    suggested_fix="Reduce conditions",
                )
            ],
            conditions_completed=["C1", "C2"],
            conditions_failed=["C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"],
            total_planned=10,
            completion_rate=0.2,
        )
        text = build_repair_prompt(diagnosis, original_code={})
        assert "SCOPE REDUCTION" in text
        assert "BASELINE" in text

    def test_dep_fix_section(self):
        """Missing packages should produce a DEPENDENCY FIXES section."""
        diagnosis = ExperimentDiagnosis(
            deficiencies=[
                Deficiency(
                    type=DeficiencyType.MISSING_DEPENDENCY,
                    severity="critical",
                    description="Missing Python package: box2d-py",
                    suggested_fix="Add 'box2d-py' to requirements.txt",
                ),
            ],
        )
        text = build_repair_prompt(diagnosis, original_code={})
        assert "DEPENDENCY FIXES" in text
        assert "box2d-py" in text

    def test_long_code_truncated(self):
        """Very long source files must be truncated in the prompt."""
        huge_source = "x = 1\n" * 5000
        text = build_repair_prompt(
            ExperimentDiagnosis(), original_code={"big.py": huge_source}
        )
        assert "truncated" in text

    def test_output_format_section(self):
        """The prompt must explain the expected fenced-file output format."""
        text = build_repair_prompt(
            ExperimentDiagnosis(), original_code={"main.py": "pass"}
        )
        assert "OUTPUT FORMAT" in text
        assert "filename.py" in text
# ---------------------------------------------------------------------------
# ExperimentRepairResult tests
# ---------------------------------------------------------------------------
class TestRepairResult:
    """Serialization of ExperimentRepairResult via to_dict()."""

    def test_serialization(self):
        payload = ExperimentRepairResult(
            success=False,
            total_cycles=2,
            final_mode=PaperMode.PRELIMINARY_STUDY,
        ).to_dict()
        assert payload["success"] is False
        assert payload["total_cycles"] == 2
        # Enum values serialize to their string form.
        assert payload["final_mode"] == "preliminary_study"

    def test_serialization_with_cycles(self):
        cycle = RepairCycleResult(
            cycle=1,
            diagnosis=ExperimentDiagnosis(summary="test"),
            repair_applied=True,
            repair_description="Fixed 2 files",
        )
        payload = ExperimentRepairResult(
            success=True,
            total_cycles=1,
            final_mode=PaperMode.FULL_PAPER,
            cycle_history=[cycle],
        ).to_dict()
        assert payload["success"] is True
        assert len(payload["cycle_history"]) == 1
        first = payload["cycle_history"][0]
        assert first["repair_applied"] is True
        assert first["diagnosis_summary"] == "test"
# ---------------------------------------------------------------------------
# Code extraction tests
# ---------------------------------------------------------------------------
class TestExtractCodeBlocks:
    """Parsing fenced code blocks out of an LLM reply."""

    def test_named_blocks(self):
        reply = """Here are the fixed files:
```python main.py
import torch
print("hello")
```
```python requirements.txt
torch>=2.0
numpy
```
"""
        extracted = _extract_code_blocks(reply)
        # Each named fence becomes its own file with its body preserved.
        for filename, fragment in (("main.py", "torch"), ("requirements.txt", "numpy")):
            assert filename in extracted
            assert fragment in extracted[filename]

    def test_unnamed_block_fallback(self):
        reply = """```python
import torch
model = torch.nn.Linear(10, 2)
print("condition=Baseline metric=0.95")
```"""
        extracted = _extract_code_blocks(reply)
        # Anonymous blocks fall back to main.py.
        assert "main.py" in extracted
        assert "torch" in extracted["main.py"]

    def test_no_blocks(self):
        assert _extract_code_blocks("No code here, just text.") == {}

    def test_path_normalization(self):
        reply = """```python src/models/main.py
import torch
print("hello world, this is a test of the extraction")
```"""
        # Directory components are stripped down to the basename.
        assert "main.py" in _extract_code_blocks(reply)
# ---------------------------------------------------------------------------
# Summary building tests
# ---------------------------------------------------------------------------
class TestBuildExperimentSummary:
    """Building an experiment summary dict from a raw run result."""

    def test_basic_summary(self):
        raw = {
            "stdout": "condition=Baseline metric=80.0\ncondition=Proposed metric=90.0",
            "stderr": "",
            "returncode": 0,
            "metrics": {
                "Baseline/0/accuracy": 80.0,
                "Proposed/0/accuracy": 90.0,
                "primary_metric": 90.0,
            },
            "elapsed_sec": 120.0,
            "timed_out": False,
        }
        built = _build_experiment_summary_from_run(raw, {"main.py": "pass"})
        assert "condition_summaries" in built
        for cond in ("Baseline", "Proposed"):
            assert cond in built["condition_summaries"]
        assert built["total_conditions"] == 2
        assert built["best_run"]["status"] == "completed"

    def test_failed_run(self):
        raw = {
            "stdout": "",
            "stderr": "Error: crash",
            "returncode": 1,
            "metrics": {},
            "elapsed_sec": 5.0,
            "timed_out": False,
        }
        built = _build_experiment_summary_from_run(raw, {})
        # Non-zero return code marks the run as failed with no conditions.
        assert built["best_run"]["status"] == "failed"
        assert built["total_conditions"] == 0

    def test_multi_seed_grouping(self):
        raw = {
            "stdout": "",
            "stderr": "",
            "returncode": 0,
            "metrics": {
                "Baseline/0/accuracy": 80.0,
                "Baseline/1/accuracy": 82.0,
                "Proposed/0/accuracy": 90.0,
                "Proposed/1/accuracy": 92.0,
            },
            "elapsed_sec": 300.0,
            "timed_out": False,
        }
        built = _build_experiment_summary_from_run(raw, {})
        assert len(built["condition_summaries"]) == 2
        baseline = built["condition_summaries"]["Baseline"]
        # Seed values 80.0 and 82.0 should average to 81.0.
        assert abs(baseline["metrics"]["accuracy"] - 81.0) < 0.01
        assert baseline["n_seeds"] == 2
# ---------------------------------------------------------------------------
# File loading tests
# ---------------------------------------------------------------------------
class TestLoadExperimentCode:
    """Locating experiment source files inside a run directory."""

    def test_loads_from_stage_13(self, tmp_path):
        target = tmp_path / "stage-13" / "experiment_final"
        target.mkdir(parents=True)
        (target / "main.py").write_text("print('hello')")
        (target / "requirements.txt").write_text("torch")
        loaded = _load_experiment_code(tmp_path)
        assert "main.py" in loaded
        assert "requirements.txt" in loaded

    def test_loads_from_stage_10(self, tmp_path):
        target = tmp_path / "stage-10" / "experiment"
        target.mkdir(parents=True)
        (target / "main.py").write_text("print('hello')")
        # stage-10/experiment is the fallback location.
        assert "main.py" in _load_experiment_code(tmp_path)

    def test_empty_when_no_code(self, tmp_path):
        assert _load_experiment_code(tmp_path) == {}
class TestLoadExperimentSummary:
    """Loading experiment_summary.json from a stage-14 directory."""

    def test_loads_summary(self, tmp_path):
        stage = tmp_path / "stage-14"
        stage.mkdir()
        payload = {"condition_summaries": {"A": {}}}
        (stage / "experiment_summary.json").write_text(json.dumps(payload))
        loaded = _load_experiment_summary(tmp_path)
        assert loaded is not None
        assert "A" in loaded["condition_summaries"]
# ---------------------------------------------------------------------------
# select_best_results tests
# ---------------------------------------------------------------------------
class TestSelectBestResults:
    """Choosing the richest summary across the original run and repair cycles."""

    def test_picks_best_across_cycles(self, tmp_path):
        def _write_summary(dirname, payload):
            # Drop an experiment_summary.json into the named stage directory.
            target = tmp_path / dirname
            target.mkdir()
            (target / "experiment_summary.json").write_text(json.dumps(payload))

        # Original run: a single condition.
        _write_summary("stage-14", {
            "condition_summaries": {"A": {}},
            "best_run": {"metrics": {}},
        })
        # First repair: three conditions, so strictly better.
        _write_summary("stage-14_repair_v1", {
            "condition_summaries": {"A": {}, "B": {}, "C": {}},
            "best_run": {"metrics": {"primary_metric": 90.0}},
        })
        winner = select_best_results(tmp_path, [])
        assert winner is not None
        assert len(winner["condition_summaries"]) == 3

    def test_returns_none_when_empty(self, tmp_path):
        assert select_best_results(tmp_path, []) is None
# ---------------------------------------------------------------------------
# Full repair loop tests (mocked)
# ---------------------------------------------------------------------------
class TestRunRepairLoop:
    """End-to-end repair-loop tests: skip when sufficient, fail without code,
    and a full cycle with mocked LLM + sandbox."""

    def _make_run_dir(self, tmp_path, n_conditions=1, has_code=True):
        """Create a minimal run directory for testing."""
        # Stage 14 — experiment summary
        s14 = tmp_path / "stage-14"
        s14.mkdir()
        (s14 / "runs").mkdir()
        # One condition per index, each with a single accuracy metric.
        conds = {f"Cond{i}": {"metrics": {"accuracy": 70.0 + i}} for i in range(n_conditions)}
        summary = {
            "condition_summaries": conds,
            "best_run": {"metrics": {f"Cond{i}/0/accuracy": 70.0 + i for i in range(n_conditions)}},
            "metrics_summary": {"accuracy": {"mean": 70.5}},
        }
        (s14 / "experiment_summary.json").write_text(json.dumps(summary))
        run_data = {
            "stdout": "\n".join(f"condition=Cond{i} metric={70.0 + i}" for i in range(n_conditions)),
            "stderr": "",
        }
        (s14 / "runs" / "run_0.json").write_text(json.dumps(run_data))
        # Stage 10 — experiment code
        if has_code:
            s10 = tmp_path / "stage-10" / "experiment"
            s10.mkdir(parents=True)
            (s10 / "main.py").write_text("import torch\nprint('hello')")
        return tmp_path

    def test_skips_when_already_sufficient(self, tmp_path):
        """If experiment is already sufficient, return immediately."""
        # 3 conditions with 2+ seeds = full_paper
        s14 = tmp_path / "stage-14"
        s14.mkdir()
        (s14 / "runs").mkdir()
        summary = {
            "condition_summaries": {
                "A": {"metrics": {"m": 80.0}},
                "B": {"metrics": {"m": 85.0}},
                "C": {"metrics": {"m": 90.0}},
            },
            "best_run": {
                "metrics": {
                    "A/0/m": 80.0, "A/1/m": 81.0,
                    "B/0/m": 85.0, "B/1/m": 86.0,
                    "C/0/m": 90.0, "C/1/m": 91.0,
                },
            },
        }
        (s14 / "experiment_summary.json").write_text(json.dumps(summary))
        from researchclaw.config import ExperimentConfig, ExperimentRepairConfig

        # Minimal stand-in mimicking the attribute layout of the real config.
        class FakeConfig:
            class experiment:
                time_budget_sec = 2400
                repair = ExperimentRepairConfig(enabled=True)

            class llm:
                pass

        result = run_repair_loop(tmp_path, FakeConfig(), "test")
        # No repair cycles should run when the data already supports a full paper.
        assert result.success is True
        assert result.total_cycles == 0
        assert result.final_mode == PaperMode.FULL_PAPER

    def test_returns_failure_when_no_code(self, tmp_path):
        """If no experiment code found, return failure."""
        s14 = tmp_path / "stage-14"
        s14.mkdir()
        # Summary exists but no stage-10/stage-13 code directory is created.
        (s14 / "experiment_summary.json").write_text(json.dumps({
            "condition_summaries": {"A": {"metrics": {"m": 80.0}}},
            "best_run": {"metrics": {}},
        }))
        from researchclaw.config import ExperimentRepairConfig

        class FakeConfig:
            class experiment:
                time_budget_sec = 2400
                repair = ExperimentRepairConfig(enabled=True)

            class llm:
                pass

        result = run_repair_loop(tmp_path, FakeConfig(), "test")
        assert result.success is False
        assert result.total_cycles == 0

    def test_repair_loop_with_mocked_llm(self, tmp_path):
        """Test full repair loop with mocked LLM and sandbox."""
        run_dir = self._make_run_dir(tmp_path, n_conditions=1)
        from researchclaw.config import ExperimentRepairConfig, ExperimentConfig, OpenCodeConfig

        class FakeConfig:
            class experiment:
                time_budget_sec = 2400
                mode = "simulated"
                repair = ExperimentRepairConfig(enabled=True, max_cycles=1, use_opencode=False)
                opencode = OpenCodeConfig(enabled=False)
                metric_key = "primary_metric"

            class llm:
                pass

        # Mock the LLM to return fixed code
        mock_llm = MagicMock()
        mock_resp = MagicMock()
        # NOTE(review): original indentation inside this fenced snippet was lost
        # in extraction; reconstructed to plausible Python nesting — confirm.
        mock_resp.content = """```python main.py
import torch
for cond in ["Baseline", "Proposed", "Ablation"]:
    for seed in range(2):
        acc = 80.0 + hash(cond) % 20 + seed
        print(f"condition={cond}/{seed}/accuracy metric={acc}")
print("condition=Baseline metric=80.0")
print("condition=Proposed metric=90.0")
print("condition=Ablation metric=85.0")
```"""
        mock_llm.chat.return_value = mock_resp
        # Mock sandbox to return good results
        mock_sandbox_result = MagicMock()
        mock_sandbox_result.stdout = (
            "condition=Baseline/0/accuracy metric=80.0\n"
            "condition=Baseline/1/accuracy metric=82.0\n"
            "condition=Proposed/0/accuracy metric=90.0\n"
            "condition=Proposed/1/accuracy metric=92.0\n"
            "condition=Ablation/0/accuracy metric=85.0\n"
            "condition=Ablation/1/accuracy metric=87.0\n"
        )
        mock_sandbox_result.stderr = ""
        mock_sandbox_result.returncode = 0
        mock_sandbox_result.metrics = {
            "Baseline/0/accuracy": 80.0, "Baseline/1/accuracy": 82.0,
            "Proposed/0/accuracy": 90.0, "Proposed/1/accuracy": 92.0,
            "Ablation/0/accuracy": 85.0, "Ablation/1/accuracy": 87.0,
        }
        mock_sandbox_result.elapsed_sec = 120.0
        mock_sandbox_result.timed_out = False
        mock_sandbox = MagicMock()
        mock_sandbox.run_project.return_value = mock_sandbox_result
        # Patch both factory entry points so no real LLM/sandbox is touched.
        with patch("researchclaw.llm.create_llm_client") as mock_create_llm, \
             patch("researchclaw.experiment.factory.create_sandbox") as mock_create_sb:
            mock_create_llm.return_value = mock_llm
            mock_create_sb.return_value = mock_sandbox
            result = run_repair_loop(run_dir, FakeConfig(), "test-mock")
        assert result.total_cycles == 1
        assert len(result.cycle_history) == 1
        assert result.cycle_history[0].repair_applied is True
        # Check that repair files were saved
        repair_dir = run_dir / "stage-14_repair_v1"
        assert repair_dir.exists()
        assert (repair_dir / "experiment" / "main.py").exists()
        assert (repair_dir / "experiment_summary.json").exists()
# ---------------------------------------------------------------------------
# BUG-199: 2-part metric keys (condition/metric) in summary builder
# ---------------------------------------------------------------------------
class TestBuildExperimentSummaryTwoPartKeys:
    """BUG-199: Stage 13 refinement produces 2-part keys (condition/metric)
    instead of 3-part keys (condition/seed/metric). The parser must handle
    both formats.
    """

    @staticmethod
    def _run_with_metrics(metrics, returncode=0, elapsed=120.0):
        # Wrap a metrics map in a minimal run-result dict.
        return {
            "stdout": "",
            "stderr": "",
            "returncode": returncode,
            "metrics": metrics,
            "elapsed_sec": elapsed,
            "timed_out": False,
        }

    def test_two_part_keys_parsed(self):
        """2-part keys like 'Baseline/accuracy' should create conditions."""
        built = _build_experiment_summary_from_run(
            self._run_with_metrics({
                "Baseline/accuracy": 0.85,
                "Proposed/accuracy": 0.94,
                "Ablation/accuracy": 0.88,
            }),
            {},
        )
        assert built["total_conditions"] == 3
        for cond in ("Baseline", "Proposed", "Ablation"):
            assert cond in built["condition_summaries"]
        assert built["condition_summaries"]["Proposed"]["metrics"]["accuracy"] == 0.94

    def test_two_part_keys_create_synthetic_seed(self):
        """2-part keys should create a synthetic seed '0' entry."""
        built = _build_experiment_summary_from_run(
            self._run_with_metrics(
                {"Baseline/accuracy": 0.80, "Baseline/loss": 0.45},
                elapsed=60.0,
            ),
            {},
        )
        baseline = built["condition_summaries"]["Baseline"]
        assert baseline["metrics"]["accuracy"] == 0.80
        assert baseline["metrics"]["loss"] == 0.45
        assert baseline["n_seeds"] == 1  # synthetic seed "0"

    def test_mixed_two_and_three_part_keys(self):
        """Mix of 2-part and 3-part keys for different conditions."""
        built = _build_experiment_summary_from_run(
            self._run_with_metrics({
                # 3-part keys (with seed)
                "Baseline/0/accuracy": 0.80,
                "Baseline/1/accuracy": 0.82,
                # 2-part keys (Stage 13 refinement output)
                "Proposed/accuracy": 0.94,
            }),
            {},
        )
        assert built["total_conditions"] == 2
        # 3-part condition: mean across seeds.
        baseline = built["condition_summaries"]["Baseline"]
        assert abs(baseline["metrics"]["accuracy"] - 0.81) < 0.01
        assert baseline["n_seeds"] == 2
        # 2-part condition: single value, one synthetic seed.
        proposed = built["condition_summaries"]["Proposed"]
        assert proposed["metrics"]["accuracy"] == 0.94
        assert proposed["n_seeds"] == 1

    def test_empty_metrics_still_empty(self):
        """Empty metrics dict should still produce 0 conditions."""
        built = _build_experiment_summary_from_run(
            self._run_with_metrics({}, returncode=1, elapsed=5.0),
            {},
        )
        assert built["total_conditions"] == 0
# ---------------------------------------------------------------------------
# BUG-198: Conditional promotion of repair summary in runner.py
# ---------------------------------------------------------------------------
class TestRepairSummaryPromotion:
    """BUG-198: runner.py should NOT overwrite a richer stage-14 summary
    with an empty/poorer repair result.
    """

    def test_empty_repair_does_not_overwrite_rich_summary(self, tmp_path):
        """Repair result with 0 conditions must NOT replace a summary
        that has real conditions and metrics.
        """
        stage = tmp_path / "stage-14"
        stage.mkdir()
        rich = {
            "condition_summaries": {
                "Baseline": {"metrics": {"accuracy": 0.80}},
                "Proposed": {"metrics": {"accuracy": 0.94}},
                "Ablation": {"metrics": {"accuracy": 0.88}},
            },
            "best_run": {
                "metrics": {
                    "Baseline/0/accuracy": 0.80,
                    "Proposed/0/accuracy": 0.94,
                    "primary_metric": 0.94,
                },
            },
            "total_conditions": 3,
            "total_metric_keys": 3,
        }
        summary_file = stage / "experiment_summary.json"
        summary_file.write_text(json.dumps(rich))
        empty = {
            "condition_summaries": {},
            "best_run": {"metrics": {}},
            "total_conditions": 0,
            "total_metric_keys": 0,
        }
        # The rich summary must score strictly higher than the empty one.
        assert _summary_quality_score(rich) > _summary_quality_score(empty)
        # Simulate runner.py's comparison against the summary on disk: the
        # repair score does not beat the existing score, so no overwrite.
        on_disk = json.loads(summary_file.read_text(encoding="utf-8"))
        assert _summary_quality_score(empty) <= _summary_quality_score(on_disk)
        # The file should still contain the rich data.
        preserved = json.loads(summary_file.read_text(encoding="utf-8"))
        assert len(preserved["condition_summaries"]) == 3

    def test_richer_repair_does_overwrite(self, tmp_path):
        """Repair result with MORE conditions should replace a poorer summary."""
        stage = tmp_path / "stage-14"
        stage.mkdir()
        poor = {
            "condition_summaries": {"A": {"metrics": {"m": 0.5}}},
            "best_run": {"metrics": {}},
            "total_conditions": 1,
            "total_metric_keys": 0,
        }
        (stage / "experiment_summary.json").write_text(json.dumps(poor))
        richer = {
            "condition_summaries": {
                "A": {"metrics": {"m": 0.80}},
                "B": {"metrics": {"m": 0.85}},
                "C": {"metrics": {"m": 0.90}},
            },
            "best_run": {"metrics": {"primary_metric": 0.90}},
            "total_conditions": 3,
            "total_metric_keys": 4,
        }
        assert _summary_quality_score(richer) > _summary_quality_score(poor)
================================================
FILE: tests/test_experiment_schema.py
================================================
"""Tests for the universal experiment schema."""
from __future__ import annotations
import pytest
import yaml
from researchclaw.domains.experiment_schema import (
Condition,
ConditionRole,
EvaluationSpec,
ExperimentType,
MetricSpec,
UniversalExperimentPlan,
from_legacy_exp_plan,
)
# ---------------------------------------------------------------------------
# Condition tests
# ---------------------------------------------------------------------------
class TestCondition:
    """Defaults and field behavior of the Condition dataclass."""

    def test_default_role(self):
        # Conditions default to the "proposed" role.
        assert Condition(name="test").role == ConditionRole.PROPOSED.value

    def test_custom_role(self):
        cond = Condition(name="baseline_method", role=ConditionRole.REFERENCE.value)
        assert cond.role == "reference"

    def test_variant_with_parent(self):
        cond = Condition(
            name="ablation_no_attn",
            role=ConditionRole.VARIANT.value,
            varies_from="proposed_method",
            variation="remove_attention",
        )
        assert cond.varies_from == "proposed_method"
# ---------------------------------------------------------------------------
# UniversalExperimentPlan tests
# ---------------------------------------------------------------------------
class TestUniversalExperimentPlan:
    """Construction, role filtering, and export of UniversalExperimentPlan."""

    def test_empty_plan(self):
        blank = UniversalExperimentPlan()
        assert blank.conditions == []
        assert blank.experiment_type == "comparison"

    def test_plan_with_conditions(self):
        plan = UniversalExperimentPlan(
            experiment_type="comparison",
            conditions=[
                Condition(name="baseline", role="reference"),
                Condition(name="proposed", role="proposed"),
                Condition(name="ablation", role="variant", varies_from="proposed"),
            ],
        )
        # One condition per role bucket.
        assert (len(plan.references), len(plan.proposed), len(plan.variants)) == (1, 1, 1)

    def test_to_legacy_format(self):
        plan = UniversalExperimentPlan(
            conditions=[
                Condition(name="ResNet-18", role="reference", description="Standard baseline"),
                Condition(name="OurMethod", role="proposed", description="Our new method"),
                Condition(name="OurMethod-NoAttn", role="variant", varies_from="OurMethod"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(name="accuracy", direction="maximize"),
            ),
        )
        exported = plan.to_legacy_format()
        assert len(exported["baselines"]) == 1
        assert exported["baselines"][0]["name"] == "ResNet-18"
        assert len(exported["proposed_methods"]) == 1
        assert len(exported["ablations"]) == 1
        assert "accuracy" in exported["metrics"]

    def test_to_yaml(self):
        plan = UniversalExperimentPlan(
            experiment_type="convergence",
            domain_id="physics_pde",
            conditions=[
                Condition(name="FD2", role="reference"),
                Condition(name="FD4", role="proposed"),
            ],
        )
        parsed = yaml.safe_load(plan.to_yaml())
        experiment = parsed["experiment"]
        assert experiment["type"] == "convergence"
        assert experiment["domain"] == "physics_pde"
        assert len(experiment["conditions"]) == 2
# ---------------------------------------------------------------------------
# from_legacy_exp_plan tests
# ---------------------------------------------------------------------------
class TestFromLegacy:
    """Conversion from the legacy experiment-plan format to UniversalExperimentPlan."""

    def test_basic_legacy_plan(self):
        # Full legacy dict: baselines / proposed_methods / ablations / metrics.
        legacy = {
            "baselines": [
                {"name": "ResNet-18", "description": "Standard CNN"},
            ],
            "proposed_methods": [
                {"name": "OurNet", "description": "Our new architecture"},
            ],
            "ablations": [
                {"name": "OurNet-NoSkip", "description": "Without skip connections"},
            ],
            "metrics": {
                "accuracy": {"direction": "maximize"},
            },
        }
        plan = from_legacy_exp_plan(legacy, domain_id="ml_vision")
        assert plan.domain_id == "ml_vision"
        assert len(plan.references) == 1
        assert plan.references[0].name == "ResNet-18"
        assert len(plan.proposed) == 1
        assert len(plan.variants) == 1
        assert plan.evaluation.primary_metric.name == "accuracy"
        assert plan.evaluation.primary_metric.direction == "maximize"

    def test_legacy_string_names(self):
        # Entries may be bare strings rather than dicts.
        legacy = {
            "baselines": ["baseline_1", "baseline_2"],
            "proposed_methods": ["our_method"],
            "ablations": [],
        }
        plan = from_legacy_exp_plan(legacy)
        assert len(plan.references) == 2
        assert plan.references[0].name == "baseline_1"

    def test_legacy_yaml_string(self):
        # The converter also accepts a raw YAML document.
        # NOTE(review): original indentation of this YAML literal was lost in
        # extraction; reconstructed to standard YAML nesting — confirm.
        yaml_str = """
baselines:
  - name: Euler
    description: Basic Euler method
proposed_methods:
  - name: RK4
    description: Runge-Kutta 4th order
metrics:
  convergence_order:
    direction: maximize
"""
        plan = from_legacy_exp_plan(yaml_str, domain_id="mathematics_numerical")
        assert plan.domain_id == "mathematics_numerical"
        assert len(plan.references) == 1
        assert plan.evaluation.primary_metric.name == "convergence_order"

    def test_roundtrip_legacy(self):
        """Test that converting to legacy and back preserves structure."""
        plan = UniversalExperimentPlan(
            conditions=[
                Condition(name="A", role="reference"),
                Condition(name="B", role="proposed"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(name="error", direction="minimize"),
            ),
        )
        legacy = plan.to_legacy_format()
        plan2 = from_legacy_exp_plan(legacy)
        assert len(plan2.references) == 1
        assert len(plan2.proposed) == 1
        assert plan2.evaluation.primary_metric.direction == "minimize"

    def test_empty_legacy(self):
        # An empty legacy dict yields an empty plan rather than raising.
        plan = from_legacy_exp_plan({})
        assert plan.conditions == []

    def test_metrics_as_list(self):
        # Metrics may be a plain list; the first entry becomes primary.
        legacy = {"metrics": ["accuracy", "f1"]}
        plan = from_legacy_exp_plan(legacy)
        assert plan.evaluation.primary_metric.name == "accuracy"
# ---------------------------------------------------------------------------
# Enum tests
# ---------------------------------------------------------------------------
class TestEnums:
    """String values of the schema enums."""

    def test_condition_role_values(self):
        expected = {
            ConditionRole.REFERENCE: "reference",
            ConditionRole.PROPOSED: "proposed",
            ConditionRole.VARIANT: "variant",
        }
        for member, value in expected.items():
            assert member.value == value

    def test_experiment_type_values(self):
        expected = {
            ExperimentType.COMPARISON: "comparison",
            ExperimentType.CONVERGENCE: "convergence",
            ExperimentType.PROGRESSIVE_SPEC: "progressive_spec",
        }
        for member, value in expected.items():
            assert member.value == value
================================================
FILE: tests/test_figure_agent.py
================================================
"""Tests for the FigureAgent multi-agent chart generation system."""
from __future__ import annotations
import json
import os
import sys
import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from unittest import mock
import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
@dataclass
class _FakeLLMResponse:
    """Stand-in for an LLM client response: generated text plus token counts."""

    # Generated completion text (tests set this via _FakeLLM's canned response).
    content: str = ""
    # Model identifier and token accounting, fixed to plausible defaults.
    model: str = "gpt-4.1"
    prompt_tokens: int = 100
    completion_tokens: int = 200
    total_tokens: int = 300
class _FakeLLM:
    """Minimal mock LLM client conforming to _LLMClientLike."""

    def __init__(self, response: str = "{}"):
        # Canned reply returned from every chat() call.
        self._response = response
        # Record of every call, inspectable by tests.
        self.calls: list[dict[str, Any]] = []

    def chat(self, messages, *, system=None, max_tokens=None,
             temperature=None, json_mode=False):
        call_record = {
            "messages": messages,
            "system": system,
            "json_mode": json_mode,
        }
        self.calls.append(call_record)
        return _FakeLLMResponse(content=self._response)
# Sample experiment data for tests
# Three conditions (proposed / baseline / ablation), each with per-metric
# values, mean/std variants, 95% CI bounds, and a seed count of 3.
_SAMPLE_CONDITIONS = {
    "proposed_method": {
        "metrics": {
            "primary_metric": 0.85,
            "primary_metric_mean": 0.85,
            "primary_metric_std": 0.02,
            "secondary_metric": 0.72,
        },
        "ci95_low": 0.83,
        "ci95_high": 0.87,
        "n_seeds": 3,
    },
    "baseline_resnet": {
        "metrics": {
            "primary_metric": 0.78,
            "primary_metric_mean": 0.78,
            "primary_metric_std": 0.03,
            "secondary_metric": 0.65,
        },
        "ci95_low": 0.75,
        "ci95_high": 0.81,
        "n_seeds": 3,
    },
    "ablation_no_attention": {
        "metrics": {
            "primary_metric": 0.80,
            "primary_metric_mean": 0.80,
            "primary_metric_std": 0.02,
            "secondary_metric": 0.68,
        },
        "ci95_low": 0.78,
        "ci95_high": 0.82,
        "n_seeds": 3,
    },
}
# Aggregate min/mean/max/count per metric across the three conditions above.
_SAMPLE_METRICS_SUMMARY = {
    "primary_metric": {"mean": 0.81, "min": 0.78, "max": 0.85, "count": 3},
    "secondary_metric": {"mean": 0.68, "min": 0.65, "max": 0.72, "count": 3},
}
# =========================================================================
# Style Config tests
# =========================================================================
class TestStyleConfig:
    """Constants and preamble generation in figure_agent.style_config."""

    def test_constants_exist(self):
        from researchclaw.agents.figure_agent.style_config import (
            COLORS_BRIGHT, DPI_PUBLICATION, FIGURE_WIDTH,
            MATPLOTLIB_STYLES, OUTPUT_FORMAT_PRIMARY,
        )
        assert len(COLORS_BRIGHT) >= 7
        assert DPI_PUBLICATION >= 300
        for width_key in ("single_column", "double_column"):
            assert width_key in FIGURE_WIDTH
        assert len(MATPLOTLIB_STYLES) >= 1
        assert OUTPUT_FORMAT_PRIMARY in ("pdf", "png")

    def test_get_style_preamble(self):
        from researchclaw.agents.figure_agent.style_config import get_style_preamble
        text = get_style_preamble()
        for fragment in ("matplotlib", "plt", "COLORS", "300"):
            assert fragment in text

    def test_custom_dpi(self):
        from researchclaw.agents.figure_agent.style_config import get_style_preamble
        # A caller-supplied DPI must be embedded in the preamble.
        assert "150" in get_style_preamble(dpi=150)
# =========================================================================
# Planner Agent tests
# =========================================================================
class TestPlannerAgent:
def test_domain_detection_classification(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
agent = PlannerAgent(_FakeLLM())
assert agent._detect_domain("Image classification with CIFAR-10") == "classification"
def test_domain_detection_rl(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
agent = PlannerAgent(_FakeLLM())
assert agent._detect_domain("Reinforcement learning with reward shaping") == "reinforcement_learning"
def test_domain_detection_default(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
agent = PlannerAgent(_FakeLLM())
assert agent._detect_domain("Quantum computing analysis") == "default"
def test_analyze_data_basic(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
agent = PlannerAgent(_FakeLLM())
analysis = agent._analyze_data(
results={},
conditions=["proposed", "baseline", "ablation_no_x"],
metrics_summary=_SAMPLE_METRICS_SUMMARY,
condition_summaries=_SAMPLE_CONDITIONS,
metric_key="primary_metric",
)
assert analysis["num_conditions"] == 3
assert analysis["has_ablation"] is True
assert analysis["has_per_condition_data"] is True
assert analysis["has_multiple_seeds"] is True
def test_analyze_data_training_history(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
agent = PlannerAgent(_FakeLLM())
analysis = agent._analyze_data(
results={"training_history": [1.0, 0.5, 0.3]},
conditions=["a"],
metrics_summary={},
condition_summaries={},
metric_key="loss",
)
assert analysis["has_training_history"] is True
def test_fallback_plan(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
agent = PlannerAgent(_FakeLLM())
analysis = {
"num_conditions": 3,
"num_metrics": 2,
"metric_names": ["primary_metric", "secondary_metric"],
"has_training_history": False,
"has_ablation": True,
"has_multiple_seeds": True,
"has_per_condition_data": True,
"condition_values": {"proposed": 0.85, "baseline": 0.78},
}
figures = agent._fallback_plan("classification", analysis, "primary_metric", ["proposed", "baseline"])
assert len(figures) >= 2
types = {f["chart_type"] for f in figures}
assert "bar_comparison" in types
assert "ablation_grouped" in types
def test_execute_with_llm_response(self):
from researchclaw.agents.figure_agent.planner import PlannerAgent
llm = _FakeLLM(json.dumps({
"figures": [
{
"figure_id": "fig_main",
"chart_type": "bar_comparison",
"title": "Main Results",
"caption": "Comparison of methods.",
"data_source": {"type": "condition_comparison", "metric": "primary_metric"},
"x_label": "Method",
"y_label": "Accuracy",
"width": "single_column",
"priority": 1,
"section": "results",
},
{
"figure_id": "fig_ablation",
"chart_type": "ablation_grouped",
"title": "Ablation",
"caption": "Component analysis.",
"data_source": {"type": "ablation_comparison", "metric": "primary_metric"},
"x_label": "Variant",
"y_label": "Accuracy",
"width": "single_column",
"priority": 1,
"section": "results",
},
{
"figure_id": "fig_heatmap",
"chart_type": "heatmap",
"title": "Metric Heatmap",
"caption": "Cross-metric analysis.",
"data_source": {"type": "multi_metric"},
"x_label": "Metric",
"y_label": "Method",
"width": "double_column",
"priority": 2,
"section": "analysis",
},
]
}))
agent = PlannerAgent(llm, min_figures=3)
result = agent.execute({
"experiment_results": {},
"topic": "Image classification with knowledge distillation",
"metric_key": "primary_metric",
"conditions": list(_SAMPLE_CONDITIONS.keys()),
"metrics_summary": _SAMPLE_METRICS_SUMMARY,
"condition_summaries": _SAMPLE_CONDITIONS,
})
assert result.success
assert len(result.data["figures"]) == 3
def test_execute_fallback_on_empty_llm(self):
    """An empty LLM response must trigger the planner's fallback plan."""
    from researchclaw.agents.figure_agent.planner import PlannerAgent

    empty_llm = _FakeLLM("{}")  # Empty response
    planner = PlannerAgent(empty_llm, min_figures=2)
    request = {
        "experiment_results": {},
        "topic": "Image classification",
        "metric_key": "primary_metric",
        "conditions": list(_SAMPLE_CONDITIONS.keys()),
        "metrics_summary": _SAMPLE_METRICS_SUMMARY,
        "condition_summaries": _SAMPLE_CONDITIONS,
    }
    result = planner.execute(request)
    # The fallback must still satisfy the configured minimum figure count.
    assert result.success
    assert len(result.data["figures"]) >= 2
# =========================================================================
# CodeGen Agent tests
# =========================================================================
class TestCodeGenAgent:
    """Matplotlib script generation: built-in templates plus LLM fallback."""

    def test_template_bar_comparison(self):
        """The bar-comparison template embeds per-condition metric values."""
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        agent = CodeGenAgent(_FakeLLM())
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_main",
                "chart_type": "bar_comparison",
                "title": "Results",
                "caption": "Main results.",
                "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                "x_label": "Method",
                "y_label": "Accuracy",
                "width": "single_column",
                "section": "results",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        scripts = result.data["scripts"]
        assert len(scripts) == 1
        script = scripts[0]["script"]
        assert "0.85" in script  # proposed_method value
        assert "0.78" in script  # baseline value
        assert "savefig" in script

    def test_template_grouped_bar(self):
        """The grouped-bar template includes every requested metric name."""
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        agent = CodeGenAgent(_FakeLLM())
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_multi",
                "chart_type": "grouped_bar",
                "title": "Multi-metric",
                "caption": "Multi-metric comparison.",
                "data_source": {
                    "type": "multi_metric",
                    "metrics": ["primary_metric", "secondary_metric"],
                },
                "x_label": "Method",
                "y_label": "Score",
                "width": "double_column",
                "section": "analysis",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        scripts = result.data["scripts"]
        assert len(scripts) == 1
        assert "secondary_metric" in scripts[0]["script"]

    def test_template_heatmap(self):
        """The heatmap template is matplotlib-imshow based."""
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        agent = CodeGenAgent(_FakeLLM())
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_heat",
                "chart_type": "heatmap",
                "title": "Heatmap",
                "caption": "Analysis.",
                "data_source": {"type": "heatmap"},
                "x_label": "Metric",
                "y_label": "Method",
                "width": "double_column",
                "section": "analysis",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        scripts = result.data["scripts"]
        assert len(scripts) == 1
        assert "imshow" in scripts[0]["script"]

    def test_llm_fallback_for_unknown_type(self):
        """Chart types with no template fall back to LLM-written code."""
        # The fake LLM responds with a fenced python script; the agent should
        # strip the fences and keep the matplotlib code.
        llm = _FakeLLM("```python\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots()\nax.plot([1,2,3])\nfig.savefig('charts/fig_custom.png')\nplt.close(fig)\n```")
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        agent = CodeGenAgent(llm)
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_custom",
                "chart_type": "radar_chart",
                "title": "Radar",
                "caption": "Custom chart.",
                "data_source": {},
                "x_label": "X",
                "y_label": "Y",
                "width": "single_column",
                "section": "analysis",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        assert "matplotlib" in result.data["scripts"][0]["script"]

    def test_strip_fences(self):
        """Markdown code fences are removed from LLM output."""
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        code = "```python\nprint('hello')\n```"
        assert CodeGenAgent._strip_fences(code) == "print('hello')"

    def test_strip_fences_no_fences(self):
        """Unfenced code passes through unchanged."""
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        code = "print('hello')"
        assert CodeGenAgent._strip_fences(code) == "print('hello')"

    def test_multiple_figures(self):
        """One script is produced per planned figure."""
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent
        agent = CodeGenAgent(_FakeLLM())
        figures = [
            {
                "figure_id": f"fig_{i}",
                "chart_type": "bar_comparison",
                "title": f"Figure {i}",
                "caption": f"Caption {i}.",
                "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                "x_label": "X",
                "y_label": "Y",
                "width": "single_column",
                "section": "results",
            }
            for i in range(3)
        ]
        result = agent.execute({
            "figures": figures,
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        assert len(result.data["scripts"]) == 3
# =========================================================================
# Renderer Agent tests
# =========================================================================
class TestRendererAgent:
    """Sandboxed script execution: success, failure, and reproducibility."""

    def test_render_simple_script(self, tmp_path):
        """A valid script that writes a large-enough PNG renders successfully."""
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        agent = RendererAgent(_FakeLLM(), timeout_sec=10, use_docker=False)
        output_dir = tmp_path / "charts"
        # Use a script that creates a valid PNG without matplotlib
        # (creates a minimal 1x1 PNG file directly)
        script = textwrap.dedent("""\
            import struct, zlib
            output_path = "{output_dir}/fig_test.png"
            # Minimal valid PNG: 1x1 white pixel
            def write_png(path):
                sig = b'\\x89PNG\\r\\n\\x1a\\n'
                def chunk(ctype, data):
                    c = ctype + data
                    return struct.pack('>I', len(data)) + c + struct.pack('>I', zlib.crc32(c) & 0xffffffff)
                ihdr = struct.pack('>IIBBBBB', 1, 1, 8, 2, 0, 0, 0)
                raw = zlib.compress(b'\\x00\\xff\\xff\\xff')
                with open(path, 'wb') as f:
                    f.write(sig)
                    f.write(chunk(b'IHDR', ihdr))
                    f.write(chunk(b'IDAT', raw))
                    f.write(chunk(b'IEND', b''))
            write_png(output_path)
            # Pad file to meet minimum size requirement
            with open(output_path, 'ab') as f:
                f.write(b'\\x00' * 2048)
            print(f"Saved: {{output_path}}")
        """).format(output_dir=output_dir)
        result = agent.execute({
            "scripts": [{
                "figure_id": "fig_test",
                "script": script,
                "output_filename": "fig_test.png",
                "title": "Test",
                "caption": "Test chart",
                "section": "results",
            }],
            "output_dir": str(output_dir),
        })
        assert result.success
        rendered = result.data["rendered"]
        assert len(rendered) == 1
        assert rendered[0]["success"] is True
        assert Path(rendered[0]["output_path"]).exists()

    def test_render_syntax_error(self, tmp_path):
        """An unparseable script fails per-figure, not the whole agent."""
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        agent = RendererAgent(_FakeLLM(), timeout_sec=5)
        result = agent.execute({
            "scripts": [{
                "figure_id": "fig_bad",
                "script": "this is not valid python!!!",
                "output_filename": "fig_bad.png",
            }],
            "output_dir": str(tmp_path / "charts"),
        })
        # The renderer itself succeeds (returns results), but individual
        # figures have success=False.  Previously this invariant was only
        # stated in a comment; assert it explicitly.
        assert result.success
        rendered = result.data["rendered"]
        assert len(rendered) == 1
        assert rendered[0]["success"] is False
        assert rendered[0]["error"]

    def test_render_empty_script(self, tmp_path):
        """An empty script is rejected with an explanatory error."""
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        agent = RendererAgent(_FakeLLM(), timeout_sec=5)
        result = agent.execute({
            "scripts": [{
                "figure_id": "fig_empty",
                "script": "",
                "output_filename": "fig_empty.png",
            }],
            "output_dir": str(tmp_path / "charts"),
        })
        rendered = result.data["rendered"]
        assert rendered[0]["success"] is False
        assert "Empty" in rendered[0]["error"]

    def test_script_saved_for_reproducibility(self, tmp_path):
        """Scripts are persisted under scripts/ even when rendering fails."""
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        agent = RendererAgent(_FakeLLM(), timeout_sec=5)
        output_dir = tmp_path / "charts"
        # Return value intentionally unused: only the on-disk side effect matters.
        agent.execute({
            "scripts": [{
                "figure_id": "fig_save",
                "script": "print('hello')",
                "output_filename": "fig_save.png",
            }],
            "output_dir": str(output_dir),
        })
        # Script should be saved even if rendering fails
        script_path = output_dir / "scripts" / "fig_save.py"
        assert script_path.exists()
        assert script_path.read_text() == "print('hello')"
# =========================================================================
# Critic Agent tests
# =========================================================================
class TestCriticAgent:
    """Numerical, textual, and LLM-based visual review of figure scripts."""

    def test_numerical_accuracy_pass(self):
        """Scripts containing the true condition values raise no critical issue."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        llm = _FakeLLM(json.dumps({
            "quality_score": 8,
            "issues": [],
        }))
        agent = CriticAgent(llm)
        script = "values = [0.85, 0.78, 0.80]\nax.bar(x, values)\nfig.savefig('out.png')\nplt.close(fig)"
        issues = agent._check_numerical_accuracy(script, _SAMPLE_CONDITIONS, "primary_metric")
        # Values 0.85 and 0.78 are in script → should pass
        assert not any(i["severity"] == "critical" for i in issues)

    def test_numerical_accuracy_fail(self):
        """Scripts with fabricated values produce a critical issue."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        agent = CriticAgent(_FakeLLM())
        script = "values = [0.99, 0.98, 0.97]"  # Wrong values
        issues = agent._check_numerical_accuracy(script, _SAMPLE_CONDITIONS, "primary_metric")
        assert any(i["severity"] == "critical" for i in issues)

    def test_text_correctness_missing_labels(self):
        """Missing axis labels and savefig calls are flagged."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        agent = CriticAgent(_FakeLLM())
        script = "fig, ax = plt.subplots()\nax.bar([0], [1])"  # Missing labels + savefig
        issues = agent._check_text_correctness(script, {})
        types = {i["message"] for i in issues}
        assert any("x-axis" in t for t in types)
        assert any("savefig" in t for t in types)

    def test_text_correctness_all_present(self):
        """A script with labels, title, savefig, and close is issue-free."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        agent = CriticAgent(_FakeLLM())
        script = (
            "ax.set_xlabel('X')\n"
            "ax.set_ylabel('Y')\n"
            "ax.set_title('T')\n"
            "fig.savefig('out.png')\n"
            "plt.close(fig)"
        )
        issues = agent._check_text_correctness(script, {})
        assert len(issues) == 0

    def test_visual_quality_llm_review(self):
        """A high LLM quality score yields no critical visual issues."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        llm = _FakeLLM(json.dumps({
            "quality_score": 9,
            "issues": [],
        }))
        agent = CriticAgent(llm)
        issues = agent._check_visual_quality(
            "import matplotlib\nplt.figure()\nplt.savefig('x.png')",
            {"title": "Test"},
        )
        assert not any(i["severity"] == "critical" for i in issues)

    def test_visual_quality_low_score(self):
        """A low LLM quality score surfaces critical issues."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        llm = _FakeLLM(json.dumps({
            "quality_score": 3,
            "issues": [{"severity": "critical", "message": "Bad colors"}],
        }))
        agent = CriticAgent(llm)
        issues = agent._check_visual_quality("plt.plot([1,2])", {"title": "Bad"})
        assert any(i["severity"] == "critical" for i in issues)

    def test_execute_full_review(self):
        """End-to-end review of a successfully rendered figure."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        llm = _FakeLLM(json.dumps({
            "quality_score": 8,
            "issues": [],
        }))
        agent = CriticAgent(llm)
        result = agent.execute({
            "rendered": [
                {
                    "figure_id": "fig_1",
                    "success": True,
                    "output_path": "/tmp/fig.png",
                    "title": "Test",
                    "caption": "Test fig",
                },
            ],
            "scripts": [
                {
                    "figure_id": "fig_1",
                    "script": (
                        "values = [0.85, 0.78]\n"
                        "ax.set_xlabel('X')\nax.set_ylabel('Y')\n"
                        "ax.set_title('T')\nfig.savefig('x.png')\nplt.close(fig)"
                    ),
                },
            ],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
        })
        assert result.success
        assert result.data["passed_count"] >= 0

    def test_review_failed_render(self):
        """Figures that failed to render are marked as not passing review."""
        from researchclaw.agents.figure_agent.critic import CriticAgent
        agent = CriticAgent(_FakeLLM())
        result = agent.execute({
            "rendered": [
                {"figure_id": "fig_1", "success": False, "error": "Crash"},
            ],
            "scripts": [],
            "condition_summaries": {},
            "metrics_summary": {},
            "metric_key": "primary_metric",
        })
        assert result.success
        assert result.data["reviews"][0]["passed"] is False
# =========================================================================
# Integrator Agent tests
# =========================================================================
class TestIntegratorAgent:
    """Manifest building, markdown references, and paper-section ordering."""

    def test_build_manifest(self):
        """Manifest entries are numbered and mapped to paper sections."""
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent
        agent = IntegratorAgent(_FakeLLM())
        rendered = [
            {
                "figure_id": "fig_main",
                "success": True,
                "output_path": "/tmp/charts/fig_main.png",
                "title": "Main Results",
                "caption": "Comparison.",
                "section": "results",
                "width": "single_column",
            },
            {
                "figure_id": "fig_ablation",
                "success": True,
                "output_path": "/tmp/charts/fig_ablation.png",
                "title": "Ablation",
                "caption": "Analysis.",
                "section": "results",
                "width": "single_column",
            },
        ]
        manifest = agent._build_manifest(rendered, Path("/tmp/charts"))
        assert len(manifest) == 2
        assert manifest[0]["figure_number"] == 1
        assert manifest[0]["paper_section"] == "Results"
        # File paths are relativized under the charts/ directory.
        assert "charts/" in manifest[0]["file_path"]

    def test_generate_markdown_refs(self):
        """Markdown references use image syntax with figure numbers."""
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent
        agent = IntegratorAgent(_FakeLLM())
        manifest = [
            {
                "figure_number": 1,
                "file_path": "charts/fig_1.png",
                "caption": "Main results comparison",
            },
        ]
        refs = agent._generate_markdown_refs(manifest)
        assert "![Figure 1:" in refs
        assert "charts/fig_1.png" in refs

    def test_generate_descriptions(self):
        """Text descriptions list figures with title and section."""
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent
        agent = IntegratorAgent(_FakeLLM())
        manifest = [
            {
                "figure_number": 1,
                "file_path": "charts/fig_1.png",
                "title": "Main Results",
                "caption": "Comparison",
                "paper_section": "Results",
            },
        ]
        desc = agent._generate_descriptions(manifest)
        assert "AVAILABLE FIGURES" in desc
        assert "Main Results" in desc
        assert "Results" in desc

    def test_execute_empty(self):
        """No rendered figures yields a successful, empty integration."""
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent
        agent = IntegratorAgent(_FakeLLM())
        result = agent.execute({
            "rendered": [],
            "topic": "Test",
            "output_dir": "/tmp/charts",
        })
        assert result.success
        assert result.data["figure_count"] == 0

    def test_execute_with_figures(self, tmp_path):
        """Integration writes figure_manifest.json to the output directory."""
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent
        agent = IntegratorAgent(_FakeLLM())
        output_dir = tmp_path / "charts"
        output_dir.mkdir()
        result = agent.execute({
            "rendered": [
                {
                    "figure_id": "fig_main",
                    "success": True,
                    "output_path": str(output_dir / "fig_main.png"),
                    "title": "Main",
                    "caption": "Main comparison.",
                    "section": "results",
                },
            ],
            "topic": "Test",
            "output_dir": str(output_dir),
        })
        assert result.success
        assert result.data["figure_count"] == 1
        assert (output_dir / "figure_manifest.json").exists()

    def test_section_ordering(self):
        """Sections sort in paper order: method < results < analysis."""
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent
        assert IntegratorAgent._section_order("method") < IntegratorAgent._section_order("results")
        assert IntegratorAgent._section_order("results") < IntegratorAgent._section_order("analysis")
# =========================================================================
# Orchestrator tests
# =========================================================================
class TestOrchestrator:
    """End-to-end figure pipeline wiring plus FigurePlan helpers."""

    def test_orchestrate_basic(self, tmp_path):
        """The orchestrator runs plan → render → review with scripted LLM replies."""
        from researchclaw.agents.figure_agent.orchestrator import (
            FigureAgentConfig, FigureOrchestrator,
        )
        # LLM returns plan, then quality review
        responses = iter([
            json.dumps({
                "figures": [{
                    "figure_id": "fig_main",
                    "chart_type": "bar_comparison",
                    "title": "Main",
                    "caption": "Main comparison.",
                    "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                    "x_label": "Method",
                    "y_label": "Accuracy",
                    "width": "single_column",
                    "priority": 1,
                    "section": "results",
                }, {
                    "figure_id": "fig_ablation",
                    "chart_type": "ablation_grouped",
                    "title": "Ablation",
                    "caption": "Ablation study.",
                    "data_source": {"type": "ablation_comparison", "metric": "primary_metric"},
                    "x_label": "Variant",
                    "y_label": "Accuracy",
                    "width": "single_column",
                    "priority": 1,
                    "section": "results",
                }, {
                    "figure_id": "fig_heatmap",
                    "chart_type": "heatmap",
                    "title": "Heatmap",
                    "caption": "Metric heatmap.",
                    "data_source": {"type": "heatmap"},
                    "x_label": "Metric",
                    "y_label": "Method",
                    "width": "double_column",
                    "priority": 2,
                    "section": "analysis",
                }],
            }),
            # Critic review (called multiple times)
            json.dumps({"quality_score": 8, "issues": []}),
            json.dumps({"quality_score": 8, "issues": []}),
            json.dumps({"quality_score": 8, "issues": []}),
        ])

        class _MultiLLM:
            """Stub LLM that replays the scripted responses in order.

            Once the iterator is exhausted it keeps answering with a passing
            critic review so extra calls never raise.
            """
            def __init__(self):
                self.calls = []

            def chat(self, messages, **kwargs):
                self.calls.append(messages)
                try:
                    resp = next(responses)
                except StopIteration:
                    resp = json.dumps({"quality_score": 8, "issues": []})
                return _FakeLLMResponse(content=resp)

        cfg = FigureAgentConfig(
            min_figures=3,
            max_figures=5,
            max_iterations=1,
            render_timeout_sec=10,
        )
        orch = FigureOrchestrator(_MultiLLM(), cfg, stage_dir=tmp_path)
        plan = orch.orchestrate({
            "experiment_results": {},
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "conditions": list(_SAMPLE_CONDITIONS.keys()),
            "topic": "Image classification",
            "output_dir": str(tmp_path / "charts"),
        })
        assert plan.total_llm_calls > 0
        assert plan.elapsed_sec > 0
        # Plan should have chart files (some may fail rendering, that's OK)
        assert isinstance(plan.manifest, list)

    def test_figure_plan_serialization(self):
        """FigurePlan.to_dict preserves counts and the manifest list."""
        from researchclaw.agents.figure_agent.orchestrator import FigurePlan
        plan = FigurePlan(
            manifest=[{"figure_number": 1, "file_path": "charts/fig.png"}],
            figure_count=1,
            passed_count=1,
        )
        d = plan.to_dict()
        assert d["figure_count"] == 1
        assert len(d["manifest"]) == 1

    def test_get_chart_files(self):
        """get_chart_files returns bare filenames from manifest paths."""
        from researchclaw.agents.figure_agent.orchestrator import FigurePlan
        plan = FigurePlan(
            manifest=[
                {"figure_number": 1, "file_path": "charts/fig_main.png"},
                {"figure_number": 2, "file_path": "charts/fig_ablation.png"},
            ],
        )
        files = plan.get_chart_files()
        assert files == ["fig_main.png", "fig_ablation.png"]
# =========================================================================
# Config tests
# =========================================================================
class TestFigureAgentConfig:
    """Figure-agent configuration defaults and dict parsing."""

    def test_default_config(self):
        from researchclaw.config import FigureAgentConfig

        defaults = FigureAgentConfig()
        assert defaults.enabled is True
        assert defaults.min_figures == 3
        assert defaults.max_figures == 8
        assert defaults.max_iterations == 3
        assert defaults.dpi == 300
        assert defaults.strict_mode is False

    def test_parse_from_dict(self):
        from researchclaw.config import _parse_figure_agent_config

        raw = {
            "enabled": False,
            "min_figures": 2,
            "max_figures": 6,
            "dpi": 150,
        }
        parsed = _parse_figure_agent_config(raw)
        assert parsed.enabled is False
        assert parsed.min_figures == 2
        assert parsed.max_figures == 6
        assert parsed.dpi == 150

    def test_parse_from_dict_extended_fields(self):
        from researchclaw.config import _parse_figure_agent_config

        raw = {
            "use_docker": False,
            "docker_image": "custom/figure:latest",
            "output_format": "latex",
            "gemini_api_key": "test-key",
            "gemini_model": "gemini-test",
            "nano_banana_enabled": False,
        }
        parsed = _parse_figure_agent_config(raw)
        assert parsed.use_docker is False
        assert parsed.docker_image == "custom/figure:latest"
        assert parsed.output_format == "latex"
        assert parsed.gemini_api_key == "test-key"
        assert parsed.gemini_model == "gemini-test"
        assert parsed.nano_banana_enabled is False

    def test_parse_empty(self):
        from researchclaw.config import _parse_figure_agent_config

        # An empty mapping must fall back to the documented defaults.
        parsed = _parse_figure_agent_config({})
        assert parsed.enabled is True
        assert parsed.min_figures == 3

    def test_experiment_config_has_figure_agent(self):
        from researchclaw.config import ExperimentConfig

        experiment_cfg = ExperimentConfig()
        assert hasattr(experiment_cfg, "figure_agent")
        assert experiment_cfg.figure_agent.enabled is True
# =========================================================================
# Backward compatibility test
# =========================================================================
class TestBackwardCompatibility:
    """The legacy visualize API and the new figure-agent package coexist."""

    def test_visualize_still_importable(self):
        """Old visualize.py functions should still be importable."""
        from researchclaw.experiment.visualize import (
            generate_all_charts,
            plot_condition_comparison,
            plot_experiment_comparison,
            plot_metric_trajectory,
        )

        for fn in (
            generate_all_charts,
            plot_condition_comparison,
            plot_experiment_comparison,
            plot_metric_trajectory,
        ):
            assert callable(fn)

    def test_figure_agent_importable(self):
        from researchclaw.agents.figure_agent import FigureOrchestrator, FigurePlan

        assert FigureOrchestrator is not None
        assert FigurePlan is not None
================================================
FILE: tests/test_knowledge_graph.py
================================================
"""Tests for the research knowledge graph (20+ tests).
Covers:
- Entity/Relation CRUD
- Graph queries (gaps, trends, comparison)
- JSON serialization/deserialization
- Incremental updates
- Visualizer exports
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.knowledge.graph.entities import Entity, EntityType
from researchclaw.knowledge.graph.relations import Relation, RelationType
from researchclaw.knowledge.graph.builder import KnowledgeGraphBuilder
from researchclaw.knowledge.graph.query import KnowledgeGraphQuery
from researchclaw.knowledge.graph.visualizer import (
export_to_dot,
export_to_json_cytoscape,
graph_summary,
)
# ── Fixtures ─────────────────────────────────────────────────────────
@pytest.fixture
def graph() -> KnowledgeGraphBuilder:
    """Fresh, empty knowledge graph capped at 100 entities."""
    builder = KnowledgeGraphBuilder(max_entities=100)
    return builder
@pytest.fixture
def populated_graph(graph: KnowledgeGraphBuilder) -> KnowledgeGraphBuilder:
    """Graph seeded with vision papers, methods, datasets, and relations.

    Notable structure relied on by tests: CIFAR-100 (d3) is deliberately
    left without any USES_DATASET relation so gap queries can find it, and
    m2 OUTPERFORMS m1 on ImageNet for method-comparison tests.
    """
    # Papers
    graph.add_paper("p1", "ResNet: Deep Residual Learning", year=2016, authors=["He"])
    graph.add_paper("p2", "ViT: An Image is Worth 16x16 Words", year=2021, authors=["Dosovitskiy"])
    graph.add_paper("p3", "DeiT: Training Data-efficient Image Transformers", year=2021, authors=["Touvron"])
    # Methods
    graph.add_method("m1", "ResNet", description="Residual connections for deep networks")
    graph.add_method("m2", "Vision Transformer", description="Transformer for image classification")
    graph.add_method("m3", "Knowledge Distillation", description="Teacher-student learning")
    # Datasets
    graph.add_dataset("d1", "ImageNet", domain="computer vision")
    graph.add_dataset("d2", "CIFAR-10", domain="computer vision")
    graph.add_dataset("d3", "CIFAR-100", domain="computer vision")
    # Relations
    graph.add_relation(Relation("p2", "p1", RelationType.CITES))
    graph.add_relation(Relation("p3", "p2", RelationType.EXTENDS))
    graph.add_relation(Relation("p3", "p1", RelationType.CITES))
    graph.add_relation(Relation("m1", "d1", RelationType.USES_DATASET))
    graph.add_relation(Relation("m1", "d2", RelationType.USES_DATASET))
    graph.add_relation(Relation("m2", "d1", RelationType.USES_DATASET))
    graph.add_relation(Relation("m2", "d2", RelationType.USES_DATASET))
    graph.add_relation(Relation("p1", "m1", RelationType.APPLIES_METHOD))
    graph.add_relation(Relation("p2", "m2", RelationType.APPLIES_METHOD))
    graph.add_relation(Relation("m2", "m1", RelationType.OUTPERFORMS, {"dataset": "ImageNet"}))
    return graph
# ── Entity Tests ─────────────────────────────────────────────────────
class TestEntity:
    """Entity construction and dict round-tripping."""

    def test_create_entity(self) -> None:
        paper = Entity("e1", EntityType.PAPER, "Test Paper")
        assert paper.id == "e1"
        assert paper.entity_type == EntityType.PAPER

    def test_to_dict(self) -> None:
        method = Entity("e1", EntityType.METHOD, "TestMethod", {"key": "val"})
        serialized = method.to_dict()
        assert serialized["entity_type"] == "method"
        assert serialized["attributes"]["key"] == "val"

    def test_from_dict(self) -> None:
        payload = {"id": "x", "entity_type": "dataset", "name": "Test", "attributes": {}}
        restored = Entity.from_dict(payload)
        assert restored.entity_type == EntityType.DATASET
class TestRelation:
    """Relation construction and dict round-tripping."""

    def test_create_relation(self) -> None:
        citation = Relation("a", "b", RelationType.CITES)
        assert citation.source_id == "a"
        assert citation.target_id == "b"

    def test_to_dict(self) -> None:
        win = Relation("a", "b", RelationType.OUTPERFORMS, {"margin": 0.05})
        serialized = win.to_dict()
        assert serialized["relation_type"] == "outperforms"
        assert serialized["attributes"]["margin"] == 0.05

    def test_from_dict(self) -> None:
        payload = {"source_id": "x", "target_id": "y", "relation_type": "extends"}
        restored = Relation.from_dict(payload)
        assert restored.relation_type == RelationType.EXTENDS
# ── Builder Tests ────────────────────────────────────────────────────
class TestKnowledgeGraphBuilder:
    """Entity/relation CRUD, capacity limits, and convenience constructors."""

    def test_add_entity(self, graph: KnowledgeGraphBuilder) -> None:
        """Adding a fresh entity succeeds and is counted."""
        e = Entity("e1", EntityType.PAPER, "Test")
        assert graph.add_entity(e)
        assert graph.entity_count == 1

    def test_add_duplicate_updates(self, graph: KnowledgeGraphBuilder) -> None:
        """Re-adding an id updates the name and merges attributes."""
        graph.add_entity(Entity("e1", EntityType.PAPER, "V1", {"a": 1}))
        graph.add_entity(Entity("e1", EntityType.PAPER, "V2", {"b": 2}))
        assert graph.entity_count == 1
        e = graph.get_entity("e1")
        assert e is not None
        assert e.name == "V2"
        assert e.attributes["a"] == 1  # merged
        assert e.attributes["b"] == 2

    def test_capacity_limit(self) -> None:
        """add_entity returns False once max_entities is reached."""
        g = KnowledgeGraphBuilder(max_entities=2)
        g.add_entity(Entity("e1", EntityType.PAPER, "P1"))
        g.add_entity(Entity("e2", EntityType.PAPER, "P2"))
        assert not g.add_entity(Entity("e3", EntityType.PAPER, "P3"))
        assert g.entity_count == 2

    def test_add_relation(self, graph: KnowledgeGraphBuilder) -> None:
        """Relations between two existing entities are accepted."""
        graph.add_entity(Entity("a", EntityType.PAPER, "A"))
        graph.add_entity(Entity("b", EntityType.PAPER, "B"))
        assert graph.add_relation(Relation("a", "b", RelationType.CITES))
        assert graph.relation_count == 1

    def test_add_relation_missing_entity(self, graph: KnowledgeGraphBuilder) -> None:
        """Relations referencing an unknown entity id are rejected."""
        graph.add_entity(Entity("a", EntityType.PAPER, "A"))
        assert not graph.add_relation(Relation("a", "missing", RelationType.CITES))

    def test_duplicate_relation(self, graph: KnowledgeGraphBuilder) -> None:
        """Identical relations are deduplicated."""
        graph.add_entity(Entity("a", EntityType.PAPER, "A"))
        graph.add_entity(Entity("b", EntityType.PAPER, "B"))
        graph.add_relation(Relation("a", "b", RelationType.CITES))
        graph.add_relation(Relation("a", "b", RelationType.CITES))  # duplicate
        assert graph.relation_count == 1

    def test_get_entities_by_type(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Type filtering finds all three seeded papers."""
        papers = populated_graph.get_entities_by_type(EntityType.PAPER)
        assert len(papers) == 3

    def test_get_relations_for(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Both incoming and outgoing relations are returned for an id."""
        rels = populated_graph.get_relations_for("p2")
        assert len(rels) >= 2  # outgoing + incoming

    def test_remove_entity(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Removing an entity also removes its attached relations."""
        initial_rels = populated_graph.relation_count
        assert populated_graph.remove_entity("p1")
        assert populated_graph.get_entity("p1") is None
        assert populated_graph.relation_count < initial_rels

    def test_remove_nonexistent_entity(self, graph: KnowledgeGraphBuilder) -> None:
        """Removing an unknown id reports failure instead of raising."""
        assert not graph.remove_entity("nope")

    def test_convenience_methods(self, graph: KnowledgeGraphBuilder) -> None:
        """add_paper/add_method/add_dataset tag the right entity types."""
        paper = graph.add_paper("p1", "Test Paper", year=2024)
        method = graph.add_method("m1", "TestNet", description="A test")
        dataset = graph.add_dataset("d1", "TestSet", domain="cv")
        assert paper.entity_type == EntityType.PAPER
        assert method.entity_type == EntityType.METHOD
        assert dataset.entity_type == EntityType.DATASET
# ── Persistence ──────────────────────────────────────────────────────
class TestGraphPersistence:
    """Save/load round-trip behaviour of the graph JSON format."""

    def test_save_and_load(self, populated_graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        target = tmp_path / "graph.json"
        populated_graph.save(target)
        assert target.exists()
        restored = KnowledgeGraphBuilder()
        entity_total = restored.load(target)
        # load() reports how many entities were restored.
        assert entity_total == populated_graph.entity_count
        assert restored.relation_count == populated_graph.relation_count

    def test_load_nonexistent(self, graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        # A missing file is a no-op that reports zero entities loaded.
        assert graph.load(tmp_path / "nope.json") == 0

    def test_load_malformed(self, graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        broken = tmp_path / "bad.json"
        broken.write_text("not json", encoding="utf-8")
        assert graph.load(broken) == 0
# ── Query Engine ─────────────────────────────────────────────────────
class TestKnowledgeGraphQuery:
    """Higher-level queries: gaps, trends, comparisons, topic suggestions."""

    def test_find_research_gaps(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Datasets with no methods attached surface as gaps."""
        query = KnowledgeGraphQuery(populated_graph)
        gaps = query.find_research_gaps()
        # CIFAR-100 has no methods using it
        assert any("CIFAR-100" in g for g in gaps)

    def test_find_research_gaps_with_domain(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Domain filtering still returns a (possibly empty) list."""
        query = KnowledgeGraphQuery(populated_graph)
        gaps = query.find_research_gaps(domain="computer vision")
        assert isinstance(gaps, list)

    def test_find_trending_methods(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """At least one method clears the min-citation threshold of 1."""
        query = KnowledgeGraphQuery(populated_graph)
        trending = query.find_trending_methods(min_citations=1)
        assert len(trending) > 0

    def test_get_method_comparison(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Comparing two known methods yields both sides and shared datasets."""
        query = KnowledgeGraphQuery(populated_graph)
        comparison = query.get_method_comparison("ResNet", "Vision Transformer")
        assert "method_a" in comparison
        assert "method_b" in comparison
        assert "shared_datasets" in comparison

    def test_get_method_comparison_not_found(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Unknown method names produce an error payload, not an exception."""
        query = KnowledgeGraphQuery(populated_graph)
        comparison = query.get_method_comparison("NonexistentA", "NonexistentB")
        assert "error" in comparison

    def test_suggest_topics(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """Interest keywords produce a list of suggested topics."""
        query = KnowledgeGraphQuery(populated_graph)
        topics = query.suggest_topics(["transformer", "vision"], top_k=3)
        assert isinstance(topics, list)

    def test_suggest_topics_empty_interests(self, populated_graph: KnowledgeGraphBuilder) -> None:
        """An empty interest list is handled gracefully."""
        query = KnowledgeGraphQuery(populated_graph)
        topics = query.suggest_topics([])
        assert isinstance(topics, list)
# ── Visualizer ───────────────────────────────────────────────────────
class TestVisualizer:
    """Export helpers: Graphviz DOT, Cytoscape JSON, and the text summary."""

    def test_export_dot(self, populated_graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        dot_path = tmp_path / "graph.dot"
        export_to_dot(populated_graph, dot_path)
        assert dot_path.exists()
        dot_text = dot_path.read_text(encoding="utf-8")
        assert "digraph" in dot_text
        assert "ResNet" in dot_text

    def test_export_cytoscape(self, populated_graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        json_path = tmp_path / "graph.json"
        export_to_json_cytoscape(populated_graph, json_path)
        assert json_path.exists()
        payload = json.loads(json_path.read_text(encoding="utf-8"))
        assert "elements" in payload
        assert len(payload["elements"]) > 0

    def test_graph_summary(self, populated_graph: KnowledgeGraphBuilder) -> None:
        summary = graph_summary(populated_graph)
        for key in ("entities", "relations", "paper"):
            assert key in summary
================================================
FILE: tests/test_mcp.py
================================================
"""Tests for MCP integration (C3): Server, Client, Tools, Transport, Registry."""
from __future__ import annotations
import asyncio
import pytest
from researchclaw.mcp.tools import TOOL_DEFINITIONS, get_tool_schema, list_tool_names
from researchclaw.mcp.server import ResearchClawMCPServer
from researchclaw.mcp.client import MCPClient
from researchclaw.mcp.registry import MCPServerRegistry
from researchclaw.mcp.transport import SSETransport
# ══════════════════════════════════════════════════════════════════
# MCP Tools tests
# ══════════════════════════════════════════════════════════════════
class TestMCPTools:
    """Static validation of the MCP tool schema definitions."""

    def test_tool_definitions_not_empty(self) -> None:
        assert len(TOOL_DEFINITIONS) >= 6

    def test_all_tools_have_required_fields(self) -> None:
        for definition in TOOL_DEFINITIONS:
            for field in ("name", "description", "inputSchema"):
                assert field in definition
            assert definition["inputSchema"]["type"] == "object"

    def test_get_tool_schema_exists(self) -> None:
        pipeline_schema = get_tool_schema("run_pipeline")
        assert pipeline_schema is not None
        assert pipeline_schema["name"] == "run_pipeline"

    def test_get_tool_schema_missing(self) -> None:
        assert get_tool_schema("nonexistent") is None

    def test_list_tool_names(self) -> None:
        available = list_tool_names()
        for expected in ("run_pipeline", "get_pipeline_status", "search_literature"):
            assert expected in available

    def test_run_pipeline_requires_topic(self) -> None:
        pipeline_schema = get_tool_schema("run_pipeline")
        assert pipeline_schema is not None
        assert "topic" in pipeline_schema["inputSchema"]["required"]

    def test_get_paper_has_format_enum(self) -> None:
        paper_schema = get_tool_schema("get_paper")
        assert paper_schema is not None
        properties = paper_schema["inputSchema"]["properties"]
        assert "format" in properties
        assert "enum" in properties["format"]
# ══════════════════════════════════════════════════════════════════
# MCP Server tests
# ══════════════════════════════════════════════════════════════════
class TestMCPServer:
    """Behavioral tests for ResearchClawMCPServer dispatch and lifecycle."""

    @staticmethod
    def _dispatch(name: str, arguments: dict) -> dict:
        # Create a fresh server and drive one tool call to completion.
        return asyncio.run(ResearchClawMCPServer().handle_tool_call(name, arguments))

    def test_get_tools(self) -> None:
        advertised = ResearchClawMCPServer().get_tools()
        assert len(advertised) >= 6
        assert "run_pipeline" in [tool["name"] for tool in advertised]

    def test_handle_unknown_tool(self) -> None:
        outcome = self._dispatch("nonexistent", {})
        assert outcome["success"] is False
        assert "Unknown tool" in outcome["error"]

    def test_handle_run_pipeline(self) -> None:
        outcome = self._dispatch("run_pipeline", {"topic": "GNN"})
        assert outcome["success"] is True
        assert "GNN" in outcome["message"]

    def test_handle_get_status_missing_run(self) -> None:
        outcome = self._dispatch("get_pipeline_status", {"run_id": "nonexistent"})
        assert outcome["success"] is False

    def test_handle_search_literature(self) -> None:
        outcome = self._dispatch("search_literature", {"query": "transformers"})
        assert outcome["success"] is True

    def test_handle_review_paper(self) -> None:
        outcome = self._dispatch("review_paper", {"paper_path": "/tmp/paper.md"})
        assert outcome["success"] is True

    def test_start_stop(self) -> None:
        srv = ResearchClawMCPServer()
        assert not srv.is_running

        async def _lifecycle() -> None:
            await srv.start()
            assert srv.is_running
            await srv.stop()
            assert not srv.is_running

        asyncio.run(_lifecycle())

    def test_handle_get_results_missing(self) -> None:
        outcome = self._dispatch("get_experiment_results", {"run_id": "missing"})
        assert outcome["success"] is False

    def test_handle_get_paper_missing(self) -> None:
        outcome = self._dispatch("get_paper", {"run_id": "missing"})
        assert outcome["success"] is False
# ══════════════════════════════════════════════════════════════════
# MCP Client tests
# ══════════════════════════════════════════════════════════════════
class TestMCPClient:
    """Tests for MCPClient connection state and guarded operations."""

    @staticmethod
    def _unconnected() -> MCPClient:
        # A client on which connect() has never been awaited.
        return MCPClient("http://localhost:3000")

    def test_init(self) -> None:
        fresh = self._unconnected()
        assert fresh.uri == "http://localhost:3000"
        assert not fresh.is_connected

    def test_connect_disconnect(self) -> None:
        fresh = self._unconnected()

        async def _cycle() -> None:
            await fresh.connect()
            assert fresh.is_connected
            await fresh.disconnect()
            assert not fresh.is_connected

        asyncio.run(_cycle())

    def test_list_tools_not_connected(self) -> None:
        with pytest.raises(ConnectionError):
            asyncio.run(self._unconnected().list_tools())

    def test_call_tool_not_connected(self) -> None:
        with pytest.raises(ConnectionError):
            asyncio.run(self._unconnected().call_tool("test", {}))

    def test_list_resources_not_connected(self) -> None:
        with pytest.raises(ConnectionError):
            asyncio.run(self._unconnected().list_resources())

    def test_read_resource_not_connected(self) -> None:
        with pytest.raises(ConnectionError):
            asyncio.run(self._unconnected().read_resource("test://resource"))

    def test_list_tools_connected(self) -> None:
        fresh = self._unconnected()

        async def _fetch() -> list:
            await fresh.connect()
            return await fresh.list_tools()

        assert isinstance(asyncio.run(_fetch()), list)

    def test_tools_cached(self) -> None:
        fresh = self._unconnected()

        async def _fetch_twice() -> tuple:
            await fresh.connect()
            first = await fresh.list_tools()
            second = await fresh.list_tools()
            return first, second

        first, second = asyncio.run(_fetch_twice())
        # The second call must return the very same cached object.
        assert first is second
# ══════════════════════════════════════════════════════════════════
# MCP Server Registry tests
# ══════════════════════════════════════════════════════════════════
class TestMCPServerRegistry:
    """Tests for registering, querying, and tearing down MCP servers."""

    def test_register_and_list(self) -> None:
        async def _populate() -> list:
            registry = MCPServerRegistry()
            await registry.register("test", "http://localhost:3000")
            return registry.list_all()

        listed = asyncio.run(_populate())
        assert len(listed) == 1
        assert listed[0]["name"] == "test"
        assert listed[0]["connected"] is True

    def test_unregister(self) -> None:
        async def _cycle() -> int:
            registry = MCPServerRegistry()
            await registry.register("test", "http://localhost:3000")
            await registry.unregister("test")
            return registry.count

        assert asyncio.run(_cycle()) == 0

    def test_get(self) -> None:
        async def _lookup() -> MCPClient | None:
            registry = MCPServerRegistry()
            await registry.register("test", "http://localhost:3000")
            return registry.get("test")

        found = asyncio.run(_lookup())
        assert found is not None
        assert found.is_connected

    def test_get_missing(self) -> None:
        assert MCPServerRegistry().get("nonexistent") is None

    def test_close_all(self) -> None:
        async def _teardown() -> int:
            registry = MCPServerRegistry()
            await registry.register("a", "http://a:3000")
            await registry.register("b", "http://b:3000")
            await registry.close_all()
            return registry.count

        assert asyncio.run(_teardown()) == 0
# ══════════════════════════════════════════════════════════════════
# Transport tests
# ══════════════════════════════════════════════════════════════════
class TestSSETransport:
    """Tests for the SSE transport lifecycle and unimplemented receive()."""

    def test_start_stop(self) -> None:
        sse = SSETransport(port=9999)

        async def _cycle() -> None:
            await sse.start()
            assert sse._running is True
            await sse.close()
            assert sse._running is False

        asyncio.run(_cycle())

    def test_receive_not_implemented(self) -> None:
        sse = SSETransport()
        # receive() is a declared-but-unimplemented part of the transport API.
        with pytest.raises(NotImplementedError):
            asyncio.run(sse.receive())
================================================
FILE: tests/test_memory_system.py
================================================
"""Tests for the persistent memory system (40+ tests).
Covers:
- MemoryStore CRUD operations
- Vector embedding generation (mocked)
- Similarity retrieval
- Time decay computation
- Confidence updates
- Persistence (JSONL read/write)
- IdeationMemory, ExperimentMemory, WritingMemory
"""
from __future__ import annotations
import json
import math
from datetime import datetime, timezone, timedelta
from pathlib import Path
import pytest
from researchclaw.memory.store import MemoryEntry, MemoryStore, VALID_CATEGORIES
from researchclaw.memory.decay import time_decay_weight, confidence_update
from researchclaw.memory.embeddings import EmbeddingProvider, _tokenize, _hash_token
from researchclaw.memory.retriever import MemoryRetriever, cosine_similarity
from researchclaw.memory.ideation_memory import IdeationMemory
from researchclaw.memory.experiment_memory import ExperimentMemory
from researchclaw.memory.writing_memory import WritingMemory
# ── Fixtures ─────────────────────────────────────────────────────────
@pytest.fixture
def tmp_store_dir(tmp_path: Path) -> Path:
    """A fresh, empty directory for a MemoryStore to live in."""
    store_dir = tmp_path / "memory_store"
    store_dir.mkdir()
    return store_dir


@pytest.fixture
def store(tmp_store_dir: Path) -> MemoryStore:
    """An empty MemoryStore backed by a temporary directory."""
    return MemoryStore(tmp_store_dir)


@pytest.fixture
def populated_store(store: MemoryStore) -> MemoryStore:
    """A store pre-seeded with 2 ideation, 2 experiment, and 1 writing entry."""
    seeds = [
        ("ideation", "Topic: RL for robotics\nOutcome: success", {"run_id": "r1"}),
        ("ideation", "Topic: Meta-learning\nOutcome: failure", {"run_id": "r2"}),
        ("experiment", "Task: classification\nHP: lr=0.001", {"run_id": "r1"}),
        ("experiment", "Trick: mixed precision\nImprovement: 5%", {"run_id": "r2"}),
        ("writing", "Feedback: clarity\nResolution: rewrite", {"run_id": "r1"}),
    ]
    for category, content, metadata in seeds:
        store.add(category, content, metadata)
    return store


@pytest.fixture
def embedding_fn() -> object:
    """Deterministic toy embedding: first 16 chars -> unit-normalized vector."""
    def _embed(text: str) -> list[float]:
        raw = [ord(ch) / 256.0 for ch in text[:16]]
        raw += [0.0] * (16 - len(raw))
        length = math.sqrt(sum(component * component for component in raw)) or 1.0
        return [component / length for component in raw]
    return _embed
# ── MemoryStore CRUD ─────────────────────────────────────────────────
class TestMemoryStoreCRUD:
    """Add/get/update/count behavior of MemoryStore."""

    def test_add_entry(self, store: MemoryStore) -> None:
        new_id = store.add("ideation", "test content", {"key": "value"})
        assert new_id
        assert store.count("ideation") == 1

    def test_add_invalid_category(self, store: MemoryStore) -> None:
        with pytest.raises(ValueError, match="Invalid category"):
            store.add("invalid_cat", "content")

    def test_add_all_categories(self, store: MemoryStore) -> None:
        # One entry per valid category; VALID_CATEGORIES has exactly three.
        for category in VALID_CATEGORIES:
            store.add(category, f"content for {category}")
        assert store.count() == 3

    def test_get_entry(self, store: MemoryStore) -> None:
        new_id = store.add("ideation", "findme")
        fetched = store.get(new_id)
        assert fetched is not None
        assert fetched.content == "findme"
        assert fetched.category == "ideation"

    def test_get_nonexistent(self, store: MemoryStore) -> None:
        assert store.get("nonexistent_id") is None

    def test_get_all_no_filter(self, populated_store: MemoryStore) -> None:
        assert len(populated_store.get_all()) == 5

    def test_get_all_with_filter(self, populated_store: MemoryStore) -> None:
        assert len(populated_store.get_all("ideation")) == 2

    def test_update_confidence_success(self, store: MemoryStore) -> None:
        new_id = store.add("ideation", "conf test", confidence=0.5)
        assert store.update_confidence(new_id, 0.1)
        fetched = store.get(new_id)
        assert fetched is not None
        assert abs(fetched.confidence - 0.6) < 1e-6

    def test_update_confidence_clamp_high(self, store: MemoryStore) -> None:
        new_id = store.add("ideation", "test", confidence=0.95)
        store.update_confidence(new_id, 0.2)
        fetched = store.get(new_id)
        assert fetched is not None
        assert fetched.confidence == 1.0

    def test_update_confidence_clamp_low(self, store: MemoryStore) -> None:
        new_id = store.add("ideation", "test", confidence=0.1)
        store.update_confidence(new_id, -0.5)
        fetched = store.get(new_id)
        assert fetched is not None
        assert fetched.confidence == 0.0

    def test_update_confidence_nonexistent(self, store: MemoryStore) -> None:
        assert not store.update_confidence("nope", 0.1)

    def test_mark_accessed(self, store: MemoryStore) -> None:
        new_id = store.add("ideation", "access test")
        before = store.get(new_id)
        assert before is not None
        assert before.access_count == 0
        store.mark_accessed(new_id)
        after = store.get(new_id)
        assert after is not None
        assert after.access_count == 1

    def test_capacity_enforcement(self, tmp_store_dir: Path) -> None:
        capped = MemoryStore(tmp_store_dir, max_entries_per_category=3)
        for i in range(5):
            capped.add("ideation", f"entry {i}", confidence=i * 0.2)
        assert capped.count("ideation") == 3
        # The lowest-confidence entries (0.0 and 0.2) must have been evicted.
        assert min(e.confidence for e in capped.get_all("ideation")) >= 0.4

    def test_count_empty(self, store: MemoryStore) -> None:
        assert store.count() == 0
        assert store.count("ideation") == 0
# ── Persistence ──────────────────────────────────────────────────────
class TestMemoryPersistence:
    """JSONL save/load round-trips for MemoryStore."""

    def test_save_and_load(self, tmp_store_dir: Path) -> None:
        original = MemoryStore(tmp_store_dir)
        original.add("ideation", "persistent content", {"key": "val"})
        original.add("experiment", "exp content")
        original.save()
        reopened = MemoryStore(tmp_store_dir)
        assert reopened.load() == 2
        assert reopened.count() == 2

    def test_save_creates_directory(self, tmp_path: Path) -> None:
        # save() must create missing parent directories on demand.
        nested = tmp_path / "new" / "nested" / "dir"
        fresh = MemoryStore(nested)
        fresh.add("ideation", "test")
        fresh.save()
        assert (nested / "ideation.jsonl").exists()

    def test_load_empty_dir(self, tmp_store_dir: Path) -> None:
        assert MemoryStore(tmp_store_dir).load() == 0

    def test_load_malformed_jsonl(self, tmp_store_dir: Path) -> None:
        (tmp_store_dir / "ideation.jsonl").write_text(
            '{"id": "a", "category": "ideation"}\nnot json\n',
            encoding="utf-8",
        )
        # The bad line is skipped; only the valid record loads.
        assert MemoryStore(tmp_store_dir).load() == 1

    def test_roundtrip_preserves_data(self, tmp_store_dir: Path) -> None:
        original = MemoryStore(tmp_store_dir)
        saved_id = original.add(
            "experiment", "test content",
            metadata={"key": "value"},
            embedding=[0.1, 0.2, 0.3],
            confidence=0.7,
        )
        original.save()
        reopened = MemoryStore(tmp_store_dir)
        reopened.load()
        restored = reopened.get(saved_id)
        assert restored is not None
        assert restored.content == "test content"
        assert restored.metadata == {"key": "value"}
        assert restored.embedding == [0.1, 0.2, 0.3]
        assert abs(restored.confidence - 0.7) < 1e-6
# ── Prune ────────────────────────────────────────────────────────────
class TestMemoryPrune:
    """Confidence-threshold pruning of stored entries."""

    def test_prune_low_confidence(self, store: MemoryStore) -> None:
        store.add("ideation", "low conf", confidence=0.1)
        store.add("ideation", "high conf", confidence=0.8)
        # Only the 0.1-confidence entry falls below the 0.5 threshold.
        assert store.prune(confidence_threshold=0.5) == 1
        assert store.count("ideation") == 1

    def test_prune_nothing_to_remove(self, store: MemoryStore) -> None:
        store.add("ideation", "good", confidence=0.9)
        assert store.prune() == 0
# ── MemoryEntry ──────────────────────────────────────────────────────
class TestMemoryEntry:
    """Serialization of MemoryEntry to and from plain dicts."""

    def test_to_dict(self) -> None:
        stamp = "2024-01-01T00:00:00+00:00"
        entry = MemoryEntry(
            id="abc", category="ideation", content="test",
            metadata={}, embedding=[], confidence=0.5,
            created_at=stamp,
            last_accessed=stamp,
            access_count=0,
        )
        serialized = entry.to_dict()
        assert serialized["id"] == "abc"
        assert serialized["category"] == "ideation"

    def test_from_dict(self) -> None:
        payload = {
            "id": "xyz", "category": "experiment", "content": "hp test",
            "metadata": {"run": "1"}, "embedding": [0.1], "confidence": 0.6,
            "created_at": "2024-06-01T00:00:00+00:00",
            "last_accessed": "2024-06-01T00:00:00+00:00",
            "access_count": 3,
        }
        restored = MemoryEntry.from_dict(payload)
        assert restored.id == "xyz"
        assert restored.access_count == 3

    def test_from_dict_defaults(self) -> None:
        # An empty payload falls back to the documented defaults.
        blank = MemoryEntry.from_dict({})
        assert blank.id == ""
        assert blank.confidence == 0.5
        assert blank.access_count == 0
# ── Time Decay ───────────────────────────────────────────────────────
class TestTimeDecay:
    """Exponential time-decay weighting of memory entries."""

    def test_fresh_entry(self) -> None:
        moment = datetime.now(timezone.utc)
        weight = time_decay_weight(moment, half_life_days=90.0, now=moment)
        assert abs(weight - 1.0) < 1e-6

    def test_half_life(self) -> None:
        moment = datetime.now(timezone.utc)
        aged = moment - timedelta(days=90)
        # One half-life back in time should weigh roughly 0.5.
        weight = time_decay_weight(aged, half_life_days=90.0, now=moment)
        assert abs(weight - 0.5) < 0.01

    def test_expired(self) -> None:
        moment = datetime.now(timezone.utc)
        ancient = moment - timedelta(days=400)
        weight = time_decay_weight(
            ancient, half_life_days=90.0, max_age_days=365.0, now=moment
        )
        assert weight == 0.0

    def test_future_timestamp(self) -> None:
        moment = datetime.now(timezone.utc)
        upcoming = moment + timedelta(days=10)
        # Timestamps in the future are treated as brand new.
        assert time_decay_weight(upcoming, now=moment) == 1.0

    def test_naive_datetime(self) -> None:
        moment = datetime.now(timezone.utc)
        tz_stripped = moment.replace(tzinfo=None)
        assert time_decay_weight(tz_stripped, now=moment) > 0.0
class TestConfidenceUpdate:
    """Clamped additive confidence updates."""

    def test_increase(self) -> None:
        # 0.5 + 0.1 stays inside [0, 1] unchanged.
        assert confidence_update(0.5, 0.1) == 0.6

    def test_decrease(self) -> None:
        # Float subtraction needs approx comparison.
        assert confidence_update(0.5, -0.2) == pytest.approx(0.3)

    def test_clamp_ceiling(self) -> None:
        # 0.95 + 0.2 is clamped down to 1.0.
        assert confidence_update(0.95, 0.2) == 1.0

    def test_clamp_floor(self) -> None:
        # 0.1 - 0.5 is clamped up to 0.0.
        assert confidence_update(0.1, -0.5) == 0.0
# ── Embeddings ───────────────────────────────────────────────────────
class TestEmbeddings:
    """EmbeddingProvider backends and the hashing TF-IDF fallback."""

    def test_tfidf_fallback(self) -> None:
        vector = EmbeddingProvider().embed("hello world test")
        assert len(vector) > 0
        assert isinstance(vector[0], float)

    def test_tfidf_normalized(self) -> None:
        vector = EmbeddingProvider().embed("deep learning neural network")
        magnitude = math.sqrt(sum(component * component for component in vector))
        assert abs(magnitude - 1.0) < 0.01

    def test_tfidf_empty(self) -> None:
        provider = EmbeddingProvider()
        # Force the TF-IDF backend to exercise the zero-vector path.
        provider._backend = "tfidf"
        provider._dim = 256
        assert all(component == 0.0 for component in provider.embed(""))

    def test_tokenize(self) -> None:
        tokens = _tokenize("Hello, World! 123")
        for expected in ("hello", "world", "123"):
            assert expected in tokens

    def test_hash_token_deterministic(self) -> None:
        # The same token and dimension must always hash identically.
        assert _hash_token("test", 256) == _hash_token("test", 256)

    def test_embed_batch(self) -> None:
        assert len(EmbeddingProvider().embed_batch(["hello", "world"])) == 2

    def test_backend_detection(self) -> None:
        assert EmbeddingProvider().backend in ("api", "sentence_transformers", "tfidf")
# ── Retriever ────────────────────────────────────────────────────────
class TestRetriever:
    """Cosine similarity and MemoryRetriever recall/formatting."""

    def test_cosine_similarity_identical(self) -> None:
        unit = [1.0, 0.0, 0.0]
        assert abs(cosine_similarity(unit, unit) - 1.0) < 1e-6

    def test_cosine_similarity_orthogonal(self) -> None:
        assert abs(cosine_similarity([1.0, 0.0], [0.0, 1.0])) < 1e-6

    def test_cosine_similarity_opposite(self) -> None:
        assert abs(cosine_similarity([1.0, 0.0], [-1.0, 0.0]) + 1.0) < 1e-6

    def test_cosine_similarity_empty(self) -> None:
        assert cosine_similarity([], []) == 0.0

    def test_cosine_similarity_mismatched_length(self) -> None:
        # Length mismatch is treated as zero similarity, not an error.
        assert cosine_similarity([1.0], [1.0, 2.0]) == 0.0

    def test_recall_empty_store(self, store: MemoryStore) -> None:
        assert MemoryRetriever(store).recall([0.1, 0.2], category="ideation") == []

    def test_recall_returns_results(self, store: MemoryStore) -> None:
        store.add("ideation", "RL research", embedding=[1.0, 0.0, 0.0])
        store.add("ideation", "NLP research", embedding=[0.0, 1.0, 0.0])
        hits = MemoryRetriever(store).recall(
            [0.9, 0.1, 0.0], category="ideation", top_k=1
        )
        assert len(hits) == 1
        # The query vector is closest to the RL entry's embedding.
        assert "RL" in hits[0][0].content

    def test_recall_respects_top_k(self, store: MemoryStore) -> None:
        for i in range(10):
            store.add("ideation", f"entry {i}", embedding=[float(i)] * 3)
        assert len(MemoryRetriever(store).recall([5.0, 5.0, 5.0], top_k=3)) == 3

    def test_format_for_prompt(self, store: MemoryStore) -> None:
        store.add("ideation", "Topic: RL", embedding=[1.0])
        retriever = MemoryRetriever(store)
        rendered = retriever.format_for_prompt(retriever.recall([1.0]))
        assert "ideation" in rendered

    def test_format_for_prompt_empty(self, store: MemoryStore) -> None:
        assert MemoryRetriever(store).format_for_prompt([]) == ""
# ── Ideation Memory ──────────────────────────────────────────────────
class TestIdeationMemory:
    """High-level ideation memory recording and recall."""

    def test_record_topic_success(self, store: MemoryStore, embedding_fn: object) -> None:
        memory = IdeationMemory(store, MemoryRetriever(store), embed_fn=embedding_fn)
        assert memory.record_topic_outcome("RL for robotics", "success", 8.0)
        assert store.count("ideation") == 1

    def test_record_topic_failure(self, store: MemoryStore) -> None:
        memory = IdeationMemory(store, MemoryRetriever(store))
        memory.record_topic_outcome("Bad topic", "failure", 2.0, run_id="r1")
        recorded = store.get_all("ideation")
        assert recorded[0].metadata["outcome"] == "failure"

    def test_record_hypothesis(self, store: MemoryStore) -> None:
        memory = IdeationMemory(store, MemoryRetriever(store))
        memory.record_hypothesis("H1: X is better than Y", True, "Validated")
        assert store.count("ideation") == 1

    def test_get_anti_patterns(self, store: MemoryStore) -> None:
        memory = IdeationMemory(store, MemoryRetriever(store))
        memory.record_topic_outcome("Bad direction", "failure", 1.0)
        memory.record_topic_outcome("Good direction", "success", 9.0)
        anti = memory.get_anti_patterns()
        # Only the failed topic should surface as an anti-pattern.
        assert len(anti) == 1
        assert "Bad" in anti[0]

    def test_recall_similar_topics_empty(self, store: MemoryStore) -> None:
        memory = IdeationMemory(store, MemoryRetriever(store))
        assert memory.recall_similar_topics("test query") == ""
# ── Experiment Memory ────────────────────────────────────────────────
class TestExperimentMemory:
    """Experiment memory: hyperparameters, architectures, and tricks."""

    def test_record_hyperparams(self, store: MemoryStore) -> None:
        memory = ExperimentMemory(store, MemoryRetriever(store))
        memory.record_hyperparams("image_cls", {"lr": 0.001, "bs": 32}, 0.95)
        assert store.count("experiment") == 1

    def test_record_architecture(self, store: MemoryStore) -> None:
        memory = ExperimentMemory(store, MemoryRetriever(store))
        memory.record_architecture("image_cls", "ResNet-18", 0.96)
        assert "ResNet" in store.get_all("experiment")[0].content

    def test_record_training_trick(self, store: MemoryStore) -> None:
        memory = ExperimentMemory(store, MemoryRetriever(store))
        memory.record_training_trick("CosineAnnealing", 0.03, "CIFAR-10 training")
        assert "CosineAnnealing" in store.get_all("experiment")[0].content

    def test_recall_best_configs_empty(self, store: MemoryStore) -> None:
        memory = ExperimentMemory(store, MemoryRetriever(store))
        assert memory.recall_best_configs("anything") == ""
# ── Writing Memory ───────────────────────────────────────────────────
class TestWritingMemory:
    """Writing memory: reviewer feedback and successful structures."""

    def test_record_review_feedback(self, store: MemoryStore) -> None:
        memory = WritingMemory(store, MemoryRetriever(store))
        memory.record_review_feedback("clarity", "Section 3 is unclear", "Rewrote S3")
        assert store.count("writing") == 1

    def test_record_successful_structure(self, store: MemoryStore) -> None:
        memory = WritingMemory(store, MemoryRetriever(store))
        memory.record_successful_structure("intro", "Problem-Gap-Contribution", 8.5)
        assert store.get_all("writing")[0].metadata["section"] == "intro"

    def test_recall_writing_tips_empty(self, store: MemoryStore) -> None:
        memory = WritingMemory(store, MemoryRetriever(store))
        assert memory.recall_writing_tips("method", "RL paper") == ""
================================================
FILE: tests/test_metaclaw_bridge/__init__.py
================================================
================================================
FILE: tests/test_metaclaw_bridge/test_config.py
================================================
"""Tests for MetaClaw bridge configuration parsing."""
from researchclaw.config import RCConfig
def _minimal_config_data(**overrides):
"""Return minimal valid config data with metaclaw_bridge overrides."""
base = {
"project": {"name": "test", "mode": "full-auto"},
"research": {"topic": "test topic", "domains": ["ml"]},
"runtime": {"timezone": "UTC"},
"notifications": {"channel": "console"},
"knowledge_base": {"backend": "markdown", "root": "docs/kb"},
"llm": {
"provider": "openai-compatible",
"base_url": "http://localhost:8080",
"api_key_env": "TEST_KEY",
"api_key": "sk-test",
"primary_model": "gpt-4o",
},
}
base.update(overrides)
return base
def test_metaclaw_bridge_defaults():
    """MetaClaw bridge should have sensible defaults when not configured."""
    cfg = RCConfig.from_dict(_minimal_config_data(), check_paths=False)
    assert cfg.metaclaw_bridge.enabled is False
    assert cfg.metaclaw_bridge.proxy_url == "http://localhost:30000"
    assert cfg.metaclaw_bridge.prm.enabled is False
    assert cfg.metaclaw_bridge.lesson_to_skill.enabled is True


def test_metaclaw_bridge_enabled():
    """MetaClaw bridge config should be parsed when provided."""
    bridge_section = {
        "enabled": True,
        "proxy_url": "http://localhost:31000",
        "skills_dir": "/tmp/skills",
        "prm": {
            "enabled": True,
            "api_base": "http://localhost:8080",
            "api_key": "test-key",
            "model": "gpt-5.4",
            "votes": 5,
            "gate_stages": [5, 20],
        },
        "lesson_to_skill": {
            "enabled": True,
            "min_severity": "warning",
            "max_skills_per_run": 5,
        },
    }
    cfg = RCConfig.from_dict(
        _minimal_config_data(metaclaw_bridge=bridge_section), check_paths=False
    )
    bridge = cfg.metaclaw_bridge
    assert bridge.enabled is True
    assert bridge.proxy_url == "http://localhost:31000"
    assert bridge.prm.enabled is True
    assert bridge.prm.votes == 5
    # The YAML list is normalized into a tuple on parse.
    assert bridge.prm.gate_stages == (5, 20)
    assert bridge.lesson_to_skill.min_severity == "warning"
    assert bridge.lesson_to_skill.max_skills_per_run == 5


def test_metaclaw_bridge_none_is_default():
    """When metaclaw_bridge is None/missing, defaults should apply."""
    cfg = RCConfig.from_dict(
        _minimal_config_data(metaclaw_bridge=None), check_paths=False
    )
    assert cfg.metaclaw_bridge.enabled is False
================================================
FILE: tests/test_metaclaw_bridge/test_lesson_to_skill.py
================================================
"""Tests for lesson-to-skill conversion module."""
import json
import tempfile
from pathlib import Path
from researchclaw.metaclaw_bridge.lesson_to_skill import (
_format_lessons,
_list_existing_skill_names,
_parse_skills_response,
_write_skill,
)
from researchclaw.evolution import LessonEntry
def _make_lesson(stage: str = "experiment_run", severity: str = "error") -> LessonEntry:
    """Build a canned LessonEntry for tests, varying only stage and severity.

    All other fields are fixed, deterministic values so assertions can match
    on the description ("NaN") and stage names without extra setup.
    """
    return LessonEntry(
        stage_name=stage,
        stage_num=12,
        category="experiment",
        severity=severity,
        description="Metric NaN detected in loss computation",
        timestamp="2026-03-15T00:00:00+00:00",
        run_id="test-001",
    )
def test_format_lessons():
    """Formatted text mentions every lesson's stage and description."""
    rendered = _format_lessons([_make_lesson(), _make_lesson("code_generation")])
    assert "experiment_run" in rendered
    assert "code_generation" in rendered
    assert "NaN" in rendered


def test_list_existing_skills(tmp_path):
    """Only directories count as skills; loose files are ignored."""
    for skill_dir in ("skill-a", "skill-b"):
        (tmp_path / skill_dir).mkdir()
    (tmp_path / "not-a-skill.txt").write_text("x")
    names = _list_existing_skill_names(tmp_path)
    assert "skill-a" in names
    assert "skill-b" in names
    assert "not-a-skill.txt" not in names


def test_list_existing_skills_missing_dir():
    """A nonexistent skills directory yields an empty list."""
    assert _list_existing_skill_names(Path("/nonexistent/dir")) == []
def test_parse_skills_response_valid():
    """A plain JSON array of skill dicts parses intact."""
    payload = [
        {
            "name": "arc-fix-nan",
            "description": "Prevent NaN in loss",
            "category": "coding",
            "content": "# Fix NaN\n1. Check inputs\n2. Use grad clipping",
        }
    ]
    parsed = _parse_skills_response(json.dumps(payload))
    assert len(parsed) == 1
    assert parsed[0]["name"] == "arc-fix-nan"


def test_parse_skills_response_with_code_fence():
    """A ```json fenced response is unwrapped before parsing."""
    payload = [
        {
            "name": "arc-test",
            "description": "test",
            "category": "coding",
            "content": "test content",
        }
    ]
    fenced = "```json\n" + json.dumps(payload) + "\n```"
    assert len(_parse_skills_response(fenced)) == 1


def test_parse_skills_response_invalid():
    """Garbage and empty arrays both produce no skills."""
    assert _parse_skills_response("not json") == []
    assert _parse_skills_response("[]") == []
def test_write_skill(tmp_path):
    """Writing a skill creates a file with front-matter and the markdown body."""
    written = _write_skill(tmp_path, {
        "name": "arc-test-skill",
        "description": "A test skill",
        "category": "coding",
        "content": "# Test\n1. Do something",
    })
    assert written is not None
    assert written.exists()
    body = written.read_text()
    # Front-matter fields plus the original markdown content must survive.
    assert "name: arc-test-skill" in body
    assert "category: coding" in body
    assert "# Test" in body
================================================
FILE: tests/test_metaclaw_bridge/test_prm_gate.py
================================================
"""Tests for PRM quality gate module."""
from unittest.mock import patch, MagicMock
from researchclaw.metaclaw_bridge.prm_gate import (
ResearchPRMGate,
_GATE_INSTRUCTIONS,
)
def test_gate_instructions_cover_expected_stages():
    """PRM gate instructions should cover key gate stages."""
    for stage_num in (5, 9, 15, 20):
        assert stage_num in _GATE_INSTRUCTIONS


def test_should_gate():
    """Gating applies exactly at the instructed stages."""
    gate = ResearchPRMGate(
        api_base="http://test",
        api_key="test",
    )
    for gated_stage in (5, 9, 15, 20):
        assert gate.should_gate(gated_stage) is True
    for ungated_stage in (1, 10):
        assert gate.should_gate(ungated_stage) is False
def test_from_bridge_config_disabled():
    """Should return None when PRM is not enabled."""
    disabled = MagicMock()
    disabled.enabled = False
    assert ResearchPRMGate.from_bridge_config(disabled) is None


def test_from_bridge_config_enabled():
    """Should create a gate when properly configured."""
    enabled = MagicMock()
    enabled.enabled = True
    enabled.api_base = "http://test"
    enabled.api_key = "test-key"
    enabled.api_key_env = ""
    enabled.model = "gpt-5.4"
    enabled.votes = 3
    enabled.temperature = 0.6
    gate = ResearchPRMGate.from_bridge_config(enabled)
    assert gate is not None
    assert gate.api_base == "http://test"
    assert gate.votes == 3
@patch("researchclaw.metaclaw_bridge.prm_gate._single_judge_call")
def test_evaluate_stage_majority_pass(mock_call):
    """Should return 1.0 when majority votes pass."""
    mock_call.side_effect = [1.0, 1.0, -1.0]
    gate = ResearchPRMGate(api_base="http://test", api_key="test", votes=3)
    assert gate.evaluate_stage(20, "This is a good paper.") == 1.0


@patch("researchclaw.metaclaw_bridge.prm_gate._single_judge_call")
def test_evaluate_stage_majority_fail(mock_call):
    """Should return -1.0 when majority votes fail."""
    mock_call.side_effect = [-1.0, -1.0, 1.0]
    gate = ResearchPRMGate(api_base="http://test", api_key="test", votes=3)
    assert gate.evaluate_stage(20, "This paper has critical issues.") == -1.0


@patch("researchclaw.metaclaw_bridge.prm_gate._single_judge_call")
def test_evaluate_stage_all_failed(mock_call):
    """Should return 0.0 when all judge calls fail."""
    # None simulates a judge call that errored out entirely.
    mock_call.side_effect = [None, None, None]
    gate = ResearchPRMGate(api_base="http://test", api_key="test", votes=3)
    assert gate.evaluate_stage(20, "test") == 0.0
================================================
FILE: tests/test_metaclaw_bridge/test_session.py
================================================
"""Tests for MetaClaw session management module."""
from researchclaw.metaclaw_bridge.session import MetaClawSession
def test_session_creation():
    """A new session gets an arc- prefixed id and starts active."""
    fresh = MetaClawSession("test-run-001")
    assert fresh.session_id == "arc-test-run-001"
    assert fresh.is_active is True


def test_session_headers():
    """Stage-aware headers carry session id, turn type, and stage name."""
    headers = MetaClawSession("run-123").get_headers("hypothesis_gen")
    assert headers["X-Session-Id"] == "arc-run-123"
    assert headers["X-Turn-Type"] == "main"
    assert headers["X-AutoRC-Stage"] == "hypothesis_gen"


def test_session_headers_no_stage():
    """Without a stage argument, no stage header is emitted."""
    assert "X-AutoRC-Stage" not in MetaClawSession("run-123").get_headers()


def test_session_end():
    """Ending a session emits done headers and deactivates it."""
    closing = MetaClawSession("run-456")
    final_headers = closing.end()
    assert final_headers["X-Session-Done"] == "true"
    assert final_headers["X-Session-Id"] == "arc-run-456"
    assert closing.is_active is False
================================================
FILE: tests/test_metaclaw_bridge/test_skill_feedback.py
================================================
"""Tests for skill feedback tracking module."""
from pathlib import Path
from researchclaw.metaclaw_bridge.skill_feedback import (
SkillEffectivenessRecord,
SkillFeedbackStore,
record_stage_skills,
)
def test_append_and_load(tmp_path):
store = SkillFeedbackStore(tmp_path / "feedback.jsonl")
rec = SkillEffectivenessRecord(
skill_name="hypothesis-formulation",
stage_name="hypothesis_gen",
run_id="test-001",
stage_success=True,
timestamp="2026-03-15T00:00:00+00:00",
)
store.append(rec)
loaded = store.load_all()
assert len(loaded) == 1
assert loaded[0].skill_name == "hypothesis-formulation"
assert loaded[0].stage_success is True
def test_append_many(tmp_path):
store = SkillFeedbackStore(tmp_path / "feedback.jsonl")
records = [
SkillEffectivenessRecord("skill-a", "stage-1", "run-1", True, "2026-01-01"),
SkillEffectivenessRecord("skill-b", "stage-2", "run-1", False, "2026-01-01"),
]
store.append_many(records)
assert len(store.load_all()) == 2
def test_compute_stats(tmp_path):
store = SkillFeedbackStore(tmp_path / "feedback.jsonl")
records = [
SkillEffectivenessRecord("skill-a", "s1", "r1", True, "t1"),
SkillEffectivenessRecord("skill-a", "s2", "r1", False, "t1"),
SkillEffectivenessRecord("skill-a", "s3", "r2", True, "t2"),
SkillEffectivenessRecord("skill-b", "s1", "r1", False, "t1"),
]
store.append_many(records)
stats = store.compute_skill_stats()
assert stats["skill-a"]["total"] == 3
assert stats["skill-a"]["successes"] == 2
assert abs(stats["skill-a"]["success_rate"] - 2 / 3) < 0.01
assert stats["skill-b"]["total"] == 1
assert stats["skill-b"]["success_rate"] == 0.0
def test_record_stage_skills(tmp_path):
    """record_stage_skills writes one record per active skill for the stage."""
    feedback_store = SkillFeedbackStore(tmp_path / "feedback.jsonl")
    active = ["hypothesis-formulation", "research-gap-identification"]
    record_stage_skills(
        feedback_store,
        stage_name="hypothesis_gen",
        run_id="test-002",
        stage_success=True,
        active_skills=active,
    )
    rows = feedback_store.load_all()
    assert len(rows) == 2
    assert {r.skill_name for r in rows} == set(active)
def test_empty_store(tmp_path):
    """A store backed by a missing file yields empty results rather than erroring."""
    feedback_store = SkillFeedbackStore(tmp_path / "nonexistent.jsonl")
    assert feedback_store.load_all() == []
    assert feedback_store.compute_skill_stats() == {}
================================================
FILE: tests/test_metaclaw_bridge/test_stage_skill_map.py
================================================
"""Tests for stage-skill mapping module."""
from researchclaw.metaclaw_bridge.stage_skill_map import (
STAGE_SKILL_MAP,
LESSON_CATEGORY_TO_SKILL_CATEGORY,
get_stage_config,
)
def test_all_23_stages_mapped():
    """All 23 pipeline stages should have a mapping entry."""
    expected_stages = (
        "topic_init", "problem_decompose", "search_strategy",
        "literature_collect", "literature_screen", "knowledge_extract",
        "synthesis", "hypothesis_gen", "experiment_design",
        "code_generation", "resource_planning", "experiment_run",
        "iterative_refine", "result_analysis", "research_decision",
        "paper_outline", "paper_draft", "peer_review",
        "paper_revision", "quality_gate", "knowledge_archive",
        "export_publish", "citation_verify",
    )
    for stage_name in expected_stages:
        assert stage_name in STAGE_SKILL_MAP, f"Missing mapping for {stage_name}"
def test_stage_config_has_required_keys():
    """Each stage config should have task_type, skills, and top_k."""
    for stage_name, config in STAGE_SKILL_MAP.items():
        # Required keys, checked uniformly; the failure message names the stage.
        assert "task_type" in config, f"{stage_name} missing task_type"
        assert "skills" in config, f"{stage_name} missing skills"
        assert "top_k" in config, f"{stage_name} missing top_k"
        # Type and range invariants for the retrieval parameters.
        assert isinstance(config["skills"], list)
        assert isinstance(config["top_k"], int)
        assert config["top_k"] > 0
def test_get_stage_config_known():
    """A known stage returns its configured task type and skill list."""
    hypothesis_cfg = get_stage_config("hypothesis_gen")
    assert hypothesis_cfg["task_type"] == "research"
    assert "hypothesis-formulation" in hypothesis_cfg["skills"]
def test_get_stage_config_unknown_returns_default():
    """Unknown stage names fall back to the default research config."""
    fallback_cfg = get_stage_config("nonexistent_stage")
    assert fallback_cfg["task_type"] == "research"
    assert fallback_cfg["top_k"] == 4
def test_lesson_category_mapping_complete():
    """All lesson categories should map to a skill category."""
    expected = {"system", "experiment", "writing", "analysis", "literature", "pipeline"}
    assert expected <= set(LESSON_CATEGORY_TO_SKILL_CATEGORY)
================================================
FILE: tests/test_metric_parser.py
================================================
"""Tests for the universal metric parser."""
from __future__ import annotations
import json
import math
import pytest
from pathlib import Path
from researchclaw.experiment.metrics import (
ExperimentResults,
MetricType,
UniversalMetricParser,
)
@pytest.fixture
def parser():
    """Fresh UniversalMetricParser instance for each test."""
    return UniversalMetricParser()
@pytest.fixture
def tmp_run_dir(tmp_path):
    """Alias for pytest's tmp_path, used as the fake experiment run directory."""
    return tmp_path
# ---------------------------------------------------------------------------
# JSON parsing tests
# ---------------------------------------------------------------------------
class TestJSONParsing:
    """Parsing of results.json files found in the run directory."""

    def test_parse_comparison_results(self, parser, tmp_run_dir):
        """A comparison-type payload populates conditions, metadata, and flat metrics."""
        data = {
            "experiment_type": "comparison",
            "conditions": {
                "proposed_method": {
                    "seed_42": {"accuracy": 0.95, "f1": 0.93},
                    "seed_123": {"accuracy": 0.94, "f1": 0.92},
                },
                "baseline": {
                    "seed_42": {"accuracy": 0.88, "f1": 0.85},
                },
            },
            "metadata": {
                "domain": "ml_vision",
                "total_runtime_sec": 120.5,
            },
        }
        (tmp_run_dir / "results.json").write_text(json.dumps(data))
        result = parser.parse(tmp_run_dir)
        assert result.source == "json"
        assert result.experiment_type == "comparison"
        assert result.domain == "ml_vision"
        assert "proposed_method" in result.conditions
        flat = result.to_flat_metrics()
        # Flattened keys are namespaced as "<condition>/<metric>".
        assert "proposed_method/accuracy" in flat

    def test_parse_convergence_results(self, parser, tmp_run_dir):
        """Convergence payloads are grouped per method with their full point list."""
        data = {
            "experiment_type": "convergence",
            "convergence": {
                "euler": [
                    {"h": 0.1, "error": 0.05},
                    {"h": 0.05, "error": 0.012},
                    {"h": 0.025, "error": 0.003},
                ],
                "rk4": [
                    {"h": 0.1, "error": 0.001},
                    {"h": 0.05, "error": 6.25e-5},
                    {"h": 0.025, "error": 3.9e-6},
                ],
            },
        }
        (tmp_run_dir / "results.json").write_text(json.dumps(data))
        result = parser.parse(tmp_run_dir)
        assert result.source == "json"
        assert "euler" in result.convergence
        assert len(result.convergence["euler"]) == 3
        flat = result.to_flat_metrics()
        assert "euler/error" in flat  # last point

    def test_parse_regression_table(self, parser, tmp_run_dir):
        """Regression-table specs flatten to "<spec>/<column>" scalar keys."""
        data = {
            "experiment_type": "progressive_spec",
            "regression_table": {
                "spec_1_ols": {"coeff": 0.15, "se": 0.03, "p": 0.001, "n": 5000, "r2": 0.12},
                "spec_2_fe": {"coeff": 0.11, "se": 0.02, "p": 0.001, "n": 5000, "r2": 0.35},
            },
        }
        (tmp_run_dir / "results.json").write_text(json.dumps(data))
        result = parser.parse(tmp_run_dir)
        assert result.source == "json"
        assert "spec_1_ols" in result.regression_table
        flat = result.to_flat_metrics()
        assert "spec_1_ols/coeff" in flat
        assert flat["spec_1_ols/coeff"] == 0.15

    def test_parse_top_level_scalars(self, parser, tmp_run_dir):
        """Bare numeric fields at the top level are captured as scalars."""
        data = {"accuracy": 0.95, "loss": 0.32}
        (tmp_run_dir / "results.json").write_text(json.dumps(data))
        result = parser.parse(tmp_run_dir)
        assert result.scalars["accuracy"] == 0.95
        assert result.scalars["loss"] == 0.32

    def test_skip_nan_inf(self, parser, tmp_run_dir):
        """Non-finite metric values never appear in the flattened output."""
        data = {
            "conditions": {
                "method": {
                    "seed_1": {"accuracy": float("nan"), "f1": 0.9},
                },
            },
        }
        (tmp_run_dir / "results.json").write_text(json.dumps(data))
        result = parser.parse(tmp_run_dir)
        flat = result.to_flat_metrics()
        # NaN should be excluded
        for k, v in flat.items():
            assert math.isfinite(v), f"Non-finite value: {k}={v}"

    def test_invalid_json_falls_through(self, parser, tmp_run_dir):
        """Malformed JSON does not raise; the parser falls back to stdout text."""
        (tmp_run_dir / "results.json").write_text("not valid json{{{")
        result = parser.parse(tmp_run_dir, stdout="metric_a: 0.5")
        # Should fallback to stdout
        assert result.source == "stdout"
# ---------------------------------------------------------------------------
# CSV parsing tests
# ---------------------------------------------------------------------------
class TestCSVParsing:
    """Parsing of results.csv files found in the run directory."""

    def test_parse_condition_csv(self, parser, tmp_run_dir):
        """Long-format condition/seed/metric/value rows become namespaced scalars."""
        csv_content = "condition,seed,metric,value\nmethod_a,42,accuracy,0.95\nmethod_b,42,accuracy,0.88\n"
        (tmp_run_dir / "results.csv").write_text(csv_content)
        result = parser.parse(tmp_run_dir)
        assert result.source == "csv"
        assert "method_a/accuracy" in result.scalars
        assert result.scalars["method_a/accuracy"] == 0.95

    def test_parse_convergence_csv(self, parser, tmp_run_dir):
        """method/h/error rows are grouped into per-method convergence series."""
        csv_content = "method,h,error\neuler,0.1,0.05\neuler,0.05,0.012\nrk4,0.1,0.001\n"
        (tmp_run_dir / "results.csv").write_text(csv_content)
        result = parser.parse(tmp_run_dir)
        assert result.source == "csv"
        assert "euler" in result.convergence
        assert len(result.convergence["euler"]) == 2

    def test_csv_skip_invalid(self, parser, tmp_run_dir):
        """Rows whose value is not numeric are skipped rather than raising."""
        csv_content = "condition,metric,value\nmethod,accuracy,not_a_number\n"
        (tmp_run_dir / "results.csv").write_text(csv_content)
        result = parser.parse(tmp_run_dir)
        assert result.source == "csv"
        assert len(result.scalars) == 0
# ---------------------------------------------------------------------------
# stdout fallback tests
# ---------------------------------------------------------------------------
class TestStdoutParsing:
    """Fallback parsing of metrics printed to stdout."""

    def test_parse_plain_metrics(self, parser, tmp_run_dir):
        """'name: value' lines become scalar metrics."""
        result = parser.parse(tmp_run_dir, stdout="accuracy: 0.95\nloss: 0.32\n")
        assert result.source == "stdout"
        assert result.scalars["accuracy"] == 0.95
        assert result.scalars["loss"] == 0.32

    def test_parse_condition_metrics(self, parser, tmp_run_dir):
        """A 'condition=<name>' prefix namespaces the metric on that line."""
        stdout = "condition=method_a accuracy: 0.95\ncondition=method_b accuracy: 0.88\n"
        result = parser.parse(tmp_run_dir, stdout=stdout)
        assert result.source == "stdout"
        assert "method_a/accuracy" in result.scalars

    def test_fallback_to_stdout_log(self, parser, tmp_run_dir):
        """Without an explicit stdout argument, a stdout.log file in the run dir is used."""
        (tmp_run_dir / "stdout.log").write_text("metric_x: 1.5\n")
        result = parser.parse(tmp_run_dir)
        assert result.source == "stdout"
        assert result.scalars.get("metric_x") == 1.5
# ---------------------------------------------------------------------------
# ExperimentResults tests
# ---------------------------------------------------------------------------
class TestExperimentResults:
    """Flattening behavior of the ExperimentResults container."""

    def test_to_flat_metrics_empty(self):
        """A default-constructed result flattens to an empty dict."""
        result = ExperimentResults()
        assert result.to_flat_metrics() == {}

    def test_to_flat_metrics_scalars(self):
        """Plain scalars pass through under their own names."""
        result = ExperimentResults(scalars={"a": 1.0, "b": 2.0})
        flat = result.to_flat_metrics()
        assert flat["a"] == 1.0
        assert flat["b"] == 2.0

    def test_to_flat_metrics_conditions(self):
        """Per-seed condition metrics surface under '<condition>/<metric>' keys."""
        result = ExperimentResults(
            conditions={
                "method": {"seed_1": {"acc": 0.9}, "seed_2": {"acc": 0.91}},
            }
        )
        flat = result.to_flat_metrics()
        assert "method/acc" in flat

    def test_to_flat_metrics_convergence(self):
        """Convergence series flatten to the final point's error value."""
        result = ExperimentResults(
            convergence={
                "euler": [
                    {"h": 0.1, "error": 0.05},
                    {"h": 0.05, "error": 0.01},
                ],
            }
        )
        flat = result.to_flat_metrics()
        assert "euler/error" in flat
        assert flat["euler/error"] == 0.01  # last point

    def test_to_flat_metrics_regression(self):
        """Regression-table cells flatten to '<spec>/<column>' keys."""
        result = ExperimentResults(
            regression_table={
                "ols": {"coeff": 0.5, "se": 0.1},
            }
        )
        flat = result.to_flat_metrics()
        assert flat["ols/coeff"] == 0.5
# ---------------------------------------------------------------------------
# Priority tests (JSON > CSV > stdout)
# ---------------------------------------------------------------------------
class TestParsePriority:
    """Source priority: JSON > CSV > stdout, with empty-source fallthrough."""

    def test_json_takes_priority_over_csv(self, parser, tmp_run_dir):
        """When both files exist, the JSON file wins."""
        (tmp_run_dir / "results.json").write_text('{"from_json": 1.0}')
        (tmp_run_dir / "results.csv").write_text("condition,metric,value\ncsv,m,2.0\n")
        result = parser.parse(tmp_run_dir)
        assert result.source == "json"

    def test_csv_takes_priority_over_stdout(self, parser, tmp_run_dir):
        """A CSV file beats stdout text."""
        (tmp_run_dir / "results.csv").write_text("condition,metric,value\ncsv,m,2.0\n")
        result = parser.parse(tmp_run_dir, stdout="stdout_metric: 3.0")
        assert result.source == "csv"

    def test_empty_json_falls_to_csv(self, parser, tmp_run_dir):
        """A JSON file with no usable metrics falls through to CSV."""
        (tmp_run_dir / "results.json").write_text("{}")
        (tmp_run_dir / "results.csv").write_text("condition,metric,value\ncsv,m,2.0\n")
        result = parser.parse(tmp_run_dir)
        assert result.source == "csv"
# ---------------------------------------------------------------------------
# MetricType enum tests
# ---------------------------------------------------------------------------
class TestMetricType:
    """String values of the MetricType enum."""

    def test_values(self):
        """Each member's value matches its expected lowercase string."""
        expected = {
            MetricType.SCALAR: "scalar",
            MetricType.TABLE: "table",
            MetricType.CONVERGENCE: "convergence",
            MetricType.STRUCTURED: "structured",
        }
        for member, value in expected.items():
            assert member.value == value
================================================
FILE: tests/test_minimax_provider.py
================================================
"""Tests for MiniMax provider integration.
Covers: provider preset, CLI registration, factory wiring,
temperature clamping, and live API integration.
"""
from __future__ import annotations
import json
import os
import urllib.request
from types import SimpleNamespace
from typing import Any, Mapping
import pytest
from researchclaw.llm import PROVIDER_PRESETS, create_llm_client
from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class _DummyHTTPResponse:
"""Minimal stub for ``urllib.request.urlopen`` results."""
def __init__(self, payload: Mapping[str, Any]):
self._payload = payload
def read(self) -> bytes:
return json.dumps(self._payload).encode("utf-8")
def __enter__(self) -> _DummyHTTPResponse:
return self
def __exit__(self, *a: object) -> None:
return None
def _make_minimax_client(
    *,
    api_key: str = "test-minimax-key",
    primary_model: str = "MiniMax-M2.5",
    fallback_models: list[str] | None = None,
) -> LLMClient:
    """Build an LLMClient pointed at the MiniMax endpoint for unit tests.

    Any falsy ``fallback_models`` value is replaced with the default
    highspeed fallback, matching the production configuration.
    """
    resolved_fallbacks = fallback_models or ["MiniMax-M2.5-highspeed"]
    return LLMClient(
        LLMConfig(
            base_url="https://api.minimax.io/v1",
            api_key=api_key,
            primary_model=primary_model,
            fallback_models=resolved_fallbacks,
        )
    )
# ---------------------------------------------------------------------------
# Unit tests — provider preset
# ---------------------------------------------------------------------------
class TestMiniMaxPreset:
    """Verify MiniMax is registered in PROVIDER_PRESETS."""

    def test_minimax_in_provider_presets(self):
        """The provider key exists in the preset registry."""
        assert "minimax" in PROVIDER_PRESETS

    def test_minimax_base_url(self):
        """The preset points at the official MiniMax API endpoint."""
        preset = PROVIDER_PRESETS["minimax"]
        assert preset["base_url"] == "https://api.minimax.io/v1"
# ---------------------------------------------------------------------------
# Unit tests — from_rc_config wiring
# ---------------------------------------------------------------------------
class TestMiniMaxFromRCConfig:
    """Verify that LLMClient.from_rc_config resolves MiniMax preset."""

    def test_from_rc_config_sets_minimax_base_url(self):
        """An empty base_url resolves to the preset URL; other fields pass through."""
        rc_config = SimpleNamespace(
            llm=SimpleNamespace(
                provider="minimax",
                base_url="",
                api_key="mk-test",
                api_key_env="",
                primary_model="MiniMax-M2.5",
                fallback_models=("MiniMax-M2.5-highspeed",),
            ),
        )
        client = LLMClient.from_rc_config(rc_config)
        assert client.config.base_url == "https://api.minimax.io/v1"
        assert client.config.api_key == "mk-test"
        assert client.config.primary_model == "MiniMax-M2.5"
        # Tuple input is normalized to a list on the resulting config.
        assert client.config.fallback_models == ["MiniMax-M2.5-highspeed"]

    def test_from_rc_config_reads_minimax_api_key_from_env(self, monkeypatch):
        """With api_key empty, the key is read from the env var named by api_key_env."""
        monkeypatch.setenv("MINIMAX_API_KEY", "env-minimax-key")
        rc_config = SimpleNamespace(
            llm=SimpleNamespace(
                provider="minimax",
                base_url="",
                api_key="",
                api_key_env="MINIMAX_API_KEY",
                primary_model="MiniMax-M2.5",
                fallback_models=(),
            ),
        )
        client = LLMClient.from_rc_config(rc_config)
        assert client.config.api_key == "env-minimax-key"

    def test_from_rc_config_custom_base_url_overrides_preset(self):
        """An explicit base_url wins over the provider preset URL."""
        rc_config = SimpleNamespace(
            llm=SimpleNamespace(
                provider="minimax",
                base_url="https://custom-proxy.example/v1",
                api_key="mk-test",
                api_key_env="",
                primary_model="MiniMax-M2.5",
                fallback_models=(),
            ),
        )
        client = LLMClient.from_rc_config(rc_config)
        assert client.config.base_url == "https://custom-proxy.example/v1"
# ---------------------------------------------------------------------------
# Unit tests — temperature clamping
# ---------------------------------------------------------------------------
class TestMiniMaxTemperatureClamping:
    """MiniMax API requires temperature in [0, 1.0]."""

    def _capture_body(
        self,
        monkeypatch: pytest.MonkeyPatch,
        client: LLMClient,
        temperature: float,
    ) -> dict[str, Any]:
        """Issue one _raw_call with urlopen stubbed out; return the JSON request body."""
        captured: dict[str, Any] = {}

        def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
            # Record the serialized body instead of hitting the network.
            captured["body"] = json.loads(req.data.decode("utf-8"))
            return _DummyHTTPResponse(
                {"choices": [{"message": {"content": "ok"}, "finish_reason": "stop"}]}
            )

        monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
        client._raw_call(
            "MiniMax-M2.5",
            [{"role": "user", "content": "hi"}],
            1024,
            temperature,
            False,
        )
        return captured["body"]

    def test_temperature_above_one_clamped(self, monkeypatch):
        """Values above 1.0 are clamped down to the MiniMax maximum."""
        client = _make_minimax_client()
        body = self._capture_body(monkeypatch, client, 1.5)
        assert body["temperature"] == 1.0

    def test_temperature_within_range_unchanged(self, monkeypatch):
        """In-range values are forwarded untouched."""
        client = _make_minimax_client()
        body = self._capture_body(monkeypatch, client, 0.7)
        assert body["temperature"] == 0.7

    def test_temperature_zero_allowed(self, monkeypatch):
        """Zero is a valid lower bound and must not be altered."""
        client = _make_minimax_client()
        body = self._capture_body(monkeypatch, client, 0.0)
        assert body["temperature"] == 0.0

    def test_temperature_negative_clamped_to_zero(self, monkeypatch):
        """Negative values are clamped up to zero."""
        client = _make_minimax_client()
        body = self._capture_body(monkeypatch, client, -0.1)
        assert body["temperature"] == 0.0

    def test_non_minimax_url_no_clamping(self, monkeypatch):
        """Non-MiniMax URLs should not clamp temperature."""
        config = LLMConfig(
            base_url="https://api.openai.com/v1",
            api_key="test-key",
            primary_model="gpt-4o",
        )
        client = LLMClient(config)
        captured: dict[str, Any] = {}

        def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
            captured["body"] = json.loads(req.data.decode("utf-8"))
            return _DummyHTTPResponse(
                {"choices": [{"message": {"content": "ok"}, "finish_reason": "stop"}]}
            )

        monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
        client._raw_call("gpt-4o", [{"role": "user", "content": "hi"}], 1024, 1.5, False)
        assert captured["body"]["temperature"] == 1.5  # no clamping
# ---------------------------------------------------------------------------
# Unit tests — model chain
# ---------------------------------------------------------------------------
class TestMiniMaxModelChain:
    """Model fallback chain for MiniMax."""

    def test_model_chain_default(self):
        """Primary model comes first, followed by the default fallback."""
        chain = _make_minimax_client()._model_chain
        assert chain == ["MiniMax-M2.5", "MiniMax-M2.5-highspeed"]

    def test_model_chain_custom_fallbacks(self):
        """Custom fallbacks keep their configured order after the primary."""
        chain = _make_minimax_client(
            primary_model="MiniMax-M2.7",
            fallback_models=["MiniMax-M2.5", "MiniMax-M2.5-highspeed"],
        )._model_chain
        assert chain == ["MiniMax-M2.7", "MiniMax-M2.5", "MiniMax-M2.5-highspeed"]
# ---------------------------------------------------------------------------
# Unit tests — raw call body structure
# ---------------------------------------------------------------------------
class TestMiniMaxRawCall:
    """Verify request body sent to MiniMax API."""

    def test_request_body_structure(self, monkeypatch):
        """The POST targets /chat/completions with model, temperature, and bearer auth."""
        client = _make_minimax_client()
        captured: dict[str, Any] = {}

        def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
            # Capture URL, body, and headers for inspection; never hit the network.
            captured["url"] = req.full_url
            captured["body"] = json.loads(req.data.decode("utf-8"))
            captured["headers"] = {k.lower(): v for k, v in req.headers.items()}
            return _DummyHTTPResponse(
                {
                    "model": "MiniMax-M2.5",
                    "choices": [{"message": {"content": "pong"}, "finish_reason": "stop"}],
                    "usage": {"prompt_tokens": 5, "completion_tokens": 1, "total_tokens": 6},
                }
            )

        monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
        resp = client._raw_call(
            "MiniMax-M2.5",
            [{"role": "user", "content": "ping"}],
            1024,
            0.5,
            False,
        )
        assert captured["url"] == "https://api.minimax.io/v1/chat/completions"
        assert captured["body"]["model"] == "MiniMax-M2.5"
        assert captured["body"]["temperature"] == 0.5
        assert captured["headers"]["authorization"] == "Bearer test-minimax-key"
        assert resp.content == "pong"
        assert resp.model == "MiniMax-M2.5"

    def test_json_mode_adds_response_format(self, monkeypatch):
        """json_mode=True adds the json_object response_format field to the body."""
        client = _make_minimax_client()
        captured: dict[str, Any] = {}

        def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
            captured["body"] = json.loads(req.data.decode("utf-8"))
            return _DummyHTTPResponse(
                {"choices": [{"message": {"content": "{}"}, "finish_reason": "stop"}]}
            )

        monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
        client._raw_call(
            "MiniMax-M2.5",
            [{"role": "user", "content": "json"}],
            1024,
            0.5,
            True,
        )
        assert captured["body"]["response_format"] == {"type": "json_object"}
# ---------------------------------------------------------------------------
# Unit tests — CLI provider registration
# ---------------------------------------------------------------------------
class TestMiniMaxCLI:
    """Verify MiniMax is in the CLI interactive provider menu."""

    def test_minimax_in_provider_choices(self):
        """At least one menu entry maps to the minimax provider."""
        from researchclaw.cli import _PROVIDER_CHOICES
        providers = [choice[0] for choice in _PROVIDER_CHOICES.values()]
        assert "minimax" in providers, "minimax not found in _PROVIDER_CHOICES"

    def test_minimax_in_provider_urls(self):
        """The CLI URL table matches the official endpoint."""
        from researchclaw.cli import _PROVIDER_URLS
        assert _PROVIDER_URLS["minimax"] == "https://api.minimax.io/v1"

    def test_minimax_in_provider_models(self):
        """Default primary/fallback models are registered for minimax."""
        from researchclaw.cli import _PROVIDER_MODELS
        default_primary, default_fallbacks = _PROVIDER_MODELS["minimax"]
        assert default_primary == "MiniMax-M2.5"
        assert "MiniMax-M2.5-highspeed" in default_fallbacks
# ---------------------------------------------------------------------------
# Unit tests — factory function
# ---------------------------------------------------------------------------
class TestMiniMaxFactory:
    """Verify create_llm_client dispatches correctly for MiniMax."""

    def test_create_llm_client_returns_llm_client(self):
        """The factory resolves the minimax preset and returns a plain LLMClient."""
        # NOTE: the previous unused `from researchclaw.config import LlmConfig,
        # RCConfig` import was removed — neither name was referenced.
        rc_config = SimpleNamespace(
            llm=SimpleNamespace(
                provider="minimax",
                base_url="",
                api_key="mk-factory-test",
                api_key_env="",
                primary_model="MiniMax-M2.5",
                fallback_models=(),
            ),
        )
        client = create_llm_client(rc_config)
        assert isinstance(client, LLMClient)
        assert client.config.base_url == "https://api.minimax.io/v1"
        assert client._anthropic is None  # Not anthropic
# ---------------------------------------------------------------------------
# Unit tests — chat fallback with MiniMax models
# ---------------------------------------------------------------------------
class TestMiniMaxChatFallback:
    """Verify fallback works with MiniMax models."""

    def test_fallback_to_highspeed_on_primary_failure(self, monkeypatch):
        """When the primary model raises, chat() retries with the fallback model."""
        client = _make_minimax_client()
        calls: list[str] = []

        def fake_call_with_retry(
            self,
            model: str,
            messages: list[dict[str, str]],
            max_tokens: int,
            temperature: float,
            json_mode: bool,
        ) -> LLMResponse:
            # Fail only for the primary model to force the fallback path.
            calls.append(model)
            if model == "MiniMax-M2.5":
                raise RuntimeError("rate limited")
            return LLMResponse(content="ok", model=model)

        monkeypatch.setattr(LLMClient, "_call_with_retry", fake_call_with_retry)
        resp = client.chat([{"role": "user", "content": "test"}])
        assert calls == ["MiniMax-M2.5", "MiniMax-M2.5-highspeed"]
        assert resp.model == "MiniMax-M2.5-highspeed"
# ---------------------------------------------------------------------------
# Integration tests — live MiniMax API (skipped without key)
# ---------------------------------------------------------------------------
@pytest.mark.skipif(
    not os.environ.get("MINIMAX_API_KEY"),
    reason="MINIMAX_API_KEY not set",
)
class TestMiniMaxLiveAPI:
    """Integration tests against the real MiniMax API."""

    def _live_client(self) -> LLMClient:
        """Build a client from the MINIMAX_API_KEY env var with small token limits."""
        return LLMClient(
            LLMConfig(
                base_url="https://api.minimax.io/v1",
                api_key=os.environ["MINIMAX_API_KEY"],
                primary_model="MiniMax-M2.5",
                fallback_models=["MiniMax-M2.5-highspeed"],
                max_tokens=64,
                timeout_sec=60,
            )
        )

    def test_simple_chat_completion(self):
        """A trivial prompt returns non-empty text containing 'hello'."""
        client = self._live_client()
        resp = client.chat(
            [{"role": "user", "content": "Say 'hello' and nothing else."}],
            max_tokens=16,
            temperature=0.1,
        )
        assert resp.content.strip(), "empty response"
        assert "hello" in resp.content.lower()

    def test_json_mode(self):
        """json_mode responses parse as JSON, optionally after stripping code fences."""
        client = self._live_client()
        resp = client.chat(
            [
                {"role": "system", "content": "You are a helpful assistant that responds in JSON."},
                {"role": "user", "content": 'Return a JSON object with key "status" set to "ok".'},
            ],
            max_tokens=128,
            temperature=0.1,
            json_mode=True,
            strip_thinking=True,
        )
        # MiniMax M2.5 may wrap JSON in markdown code fences
        import re
        text = resp.content.strip()
        fence_match = re.search(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL)
        if fence_match:
            text = fence_match.group(1).strip()
        parsed = json.loads(text)
        assert "status" in parsed

    def test_preflight_check(self):
        """preflight() reports success against the live endpoint."""
        client = self._live_client()
        ok, msg = client.preflight()
        assert ok, f"preflight failed: {msg}"
================================================
FILE: tests/test_neuroscience_domain.py
================================================
"""Tests for computational neuroscience domain support.
Covers profile loading, keyword detection, adapter dispatch, and
prompt block generation for neuroscience_computational and
neuroscience_imaging domains.
"""
from __future__ import annotations
import pytest
from researchclaw.domains.detector import (
DomainProfile,
detect_domain,
detect_domain_id,
get_profile,
_keyword_detect,
_profile_cache,
)
from researchclaw.domains.prompt_adapter import (
MLPromptAdapter,
PromptBlocks,
get_adapter,
)
# ---------------------------------------------------------------------------
# Profile loading
# ---------------------------------------------------------------------------
class TestNeuroscienceProfiles:
    """Loading and field checks for the two neuroscience domain profiles."""

    def setup_method(self):
        # Profiles are cached module-wide; clear before each test for isolation.
        _profile_cache.clear()

    def test_computational_profile_exists(self):
        """The computational profile loads with its id and display name."""
        profile = get_profile("neuroscience_computational")
        assert profile is not None
        assert profile.domain_id == "neuroscience_computational"
        assert profile.display_name == "Computational Neuroscience"

    def test_computational_profile_fields(self):
        """Paradigm, core libraries, and GPU flag match the profile definition."""
        profile = get_profile("neuroscience_computational")
        assert profile is not None
        assert profile.experiment_paradigm == "simulation"
        assert "brian2" in profile.core_libraries
        assert "numpy" in profile.core_libraries
        assert profile.gpu_required is False

    def test_computational_profile_baselines(self):
        """At least two standard baselines exist, including an integrate-and-fire model."""
        profile = get_profile("neuroscience_computational")
        assert profile is not None
        assert len(profile.standard_baselines) >= 2
        assert any("LIF" in b or "Integrate-and-Fire" in b
                   for b in profile.standard_baselines)

    def test_imaging_profile_exists(self):
        """The imaging profile loads with its id and display name."""
        profile = get_profile("neuroscience_imaging")
        assert profile is not None
        assert profile.domain_id == "neuroscience_imaging"
        assert profile.display_name == "Brain Imaging Analysis"

    def test_imaging_profile_fields(self):
        """Imaging uses the comparison paradigm with nilearn and mne libraries."""
        profile = get_profile("neuroscience_imaging")
        assert profile is not None
        assert profile.experiment_paradigm == "comparison"
        assert "nilearn" in profile.core_libraries
        assert "mne" in profile.core_libraries
# ---------------------------------------------------------------------------
# Keyword detection
# ---------------------------------------------------------------------------
class TestNeuroscienceKeywordDetection:
    """Keyword routing of free-text topics to neuroscience domain ids."""

    # --- computational neuroscience keywords -------------------------------
    def test_spiking_network(self):
        assert _keyword_detect("spiking neural model of cortical columns") == "neuroscience_computational"

    def test_brian2(self):
        assert _keyword_detect("network model implemented in brian2") == "neuroscience_computational"

    def test_hodgkin_huxley(self):
        assert _keyword_detect("Hodgkin-Huxley neuron model") == "neuroscience_computational"

    def test_integrate_and_fire(self):
        assert _keyword_detect("leaky integrate-and-fire model") == "neuroscience_computational"

    def test_izhikevich(self):
        assert _keyword_detect("Izhikevich neuron dynamics") == "neuroscience_computational"

    def test_neural_decoding(self):
        assert _keyword_detect("neural decoding of population coding in cortex") == "neuroscience_computational"

    def test_firing_rate(self):
        assert _keyword_detect("firing rate analysis of cortical neurons") == "neuroscience_computational"

    # --- brain imaging keywords --------------------------------------------
    def test_fmri(self):
        assert _keyword_detect("fmri resting state analysis") == "neuroscience_imaging"

    def test_eeg(self):
        assert _keyword_detect("EEG classification for BCI") == "neuroscience_imaging"

    def test_nilearn(self):
        assert _keyword_detect("brain parcellation with nilearn") == "neuroscience_imaging"

    def test_mne_python(self):
        assert _keyword_detect("ERP analysis using mne-python") == "neuroscience_imaging"

    # --- fallback and public entry points -----------------------------------
    def test_generic_neuroscience(self):
        # Generic neuroscience phrasing defaults to the computational domain.
        result = _keyword_detect("neuroscience of learning and memory")
        assert result == "neuroscience_computational"

    def test_detect_domain_integration(self):
        # detect_domain returns the full profile object, not just the id.
        profile = detect_domain("brian2 spiking neural model of cortical microcircuits")
        assert profile.domain_id == "neuroscience_computational"

    def test_detect_domain_id_shortcut(self):
        # detect_domain_id is the id-only shortcut over the same detection path.
        domain_id = detect_domain_id("brian2 leaky integrate-and-fire cortical model")
        assert domain_id == "neuroscience_computational"
# ---------------------------------------------------------------------------
# Adapter dispatch
# ---------------------------------------------------------------------------
class TestNeuroscienceAdapter:
    """Adapter dispatch and prompt-block generation for neuroscience domains."""

    def test_computational_gets_neuroscience_adapter(self):
        """The computational profile dispatches to NeurosciencePromptAdapter, not the ML default."""
        profile = get_profile("neuroscience_computational")
        if profile is None:
            pytest.skip("neuroscience_computational profile not found")
        adapter = get_adapter(profile)
        assert not isinstance(adapter, MLPromptAdapter)
        from researchclaw.domains.adapters.neuroscience import (
            NeurosciencePromptAdapter,
        )
        assert isinstance(adapter, NeurosciencePromptAdapter)

    def test_imaging_gets_neuroscience_adapter(self):
        """The imaging profile also avoids the default ML adapter."""
        profile = get_profile("neuroscience_imaging")
        if profile is None:
            pytest.skip("neuroscience_imaging profile not found")
        adapter = get_adapter(profile)
        assert not isinstance(adapter, MLPromptAdapter)

    def test_code_generation_blocks_nonempty(self):
        """Code-generation prompt blocks carry hints, dataset, and output guidance."""
        profile = get_profile("neuroscience_computational")
        if profile is None:
            pytest.skip("neuroscience_computational profile not found")
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints
        assert blocks.dataset_guidance
        assert blocks.output_format_guidance

    def test_experiment_design_blocks(self):
        """Experiment-design context mentions the domain and includes stats guidance."""
        profile = get_profile("neuroscience_computational")
        if profile is None:
            pytest.skip("neuroscience_computational profile not found")
        adapter = get_adapter(profile)
        blocks = adapter.get_experiment_design_blocks({})
        assert "neuroscience" in blocks.experiment_design_context.lower() or \
            "Computational Neuroscience" in blocks.experiment_design_context
        assert blocks.statistical_test_guidance

    def test_result_analysis_blocks(self):
        """Result-analysis hints reference firing-rate analysis."""
        profile = get_profile("neuroscience_computational")
        if profile is None:
            pytest.skip("neuroscience_computational profile not found")
        adapter = get_adapter(profile)
        blocks = adapter.get_result_analysis_blocks({})
        assert "firing rate" in blocks.result_analysis_hints.lower()

    def test_blueprint_context(self):
        """Blueprint context reflects the profile's file structure and libraries."""
        profile = get_profile("neuroscience_computational")
        if profile is None:
            pytest.skip("neuroscience_computational profile not found")
        adapter = get_adapter(profile)
        ctx = adapter.get_blueprint_context()
        # Should include file structure and libraries from the profile
        if profile.typical_file_structure:
            assert "network.py" in ctx or "neuron.py" in ctx
        if profile.core_libraries:
            assert "brian2" in ctx or "numpy" in ctx
================================================
FILE: tests/test_opencode_bridge.py
================================================
"""Tests for OpenCode Beast Mode bridge."""
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.config import OpenCodeConfig, _parse_opencode_config
from researchclaw.pipeline.opencode_bridge import (
ComplexityScore,
OpenCodeBridge,
OpenCodeResult,
count_historical_failures,
score_complexity,
)
# ============================================================
# TestComplexityScorer
# ============================================================
class TestComplexityScorer:
    """Tests for complexity scoring logic."""

    def test_low_complexity_simple_classification(self):
        """A short single-model plan scores low and routes to code_agent."""
        plan = (
            "Train a ResNet-18 on CIFAR-10 with SGD optimizer.\n"
            "Report test accuracy as the primary metric.\n"
            "condition_0: baseline (lr=0.1)\n"
            "condition_1: ablation (lr=0.01)\n"
        )
        result = score_complexity(plan, topic="Image classification on CIFAR-10")
        assert result.score < 0.4
        assert result.recommendation == "code_agent"

    def test_high_complexity_multimodal_gan(self):
        """A multi-component, multi-file, multi-condition plan routes to beast_mode."""
        plan = (
            "Implement a vision-language GAN with the following components:\n"
            "- Encoder: ViT-based image encoder\n"
            "- Decoder: Transformer text decoder\n"
            "- Generator: produces synthetic image-text pairs\n"
            "- Discriminator: classifies real vs fake\n"
            "- Critic: provides auxiliary reward signal\n"
            "Multiple files needed: model.py, trainer.py, dataset.py\n"
            "condition_0: baseline\n"
            "condition_1: ablation without critic\n"
            "condition_2: ablation without encoder pretraining\n"
            "condition_3: ablation with reduced generator\n"
            "Custom loss function and custom layer for cross-modal attention.\n"
        )
        result = score_complexity(
            plan, topic="Multi-modal GAN for vision-language synthesis"
        )
        assert result.score > 0.6
        assert result.recommendation == "beast_mode"

    def test_historical_failures_boost_score(self):
        """Past failures raise the score via the historical_failure signal."""
        plan = (
            "Train a simple model with encoder and decoder.\n"
            "condition_0: baseline\n"
        )
        score_without = score_complexity(plan, topic="test", historical_failures=0)
        score_with = score_complexity(plan, topic="test", historical_failures=3)
        assert score_with.score > score_without.score
        assert score_with.signals["historical_failure"] > 0

    def test_empty_plan_returns_zero(self):
        """An empty plan short-circuits to score 0 with the legacy recommendation."""
        result = score_complexity("", topic="")
        assert result.score == 0.0
        assert result.recommendation == "legacy"
        assert result.reason == "Empty plan"

    def test_threshold_boundary(self):
        """A plan scoring exactly at threshold should recommend beast_mode."""
        plan = (
            "Multi-modal diffusion model with encoder, decoder, discriminator.\n"
            "Custom loss, custom layer, wrapper pattern.\n"
            "model.py, trainer.py needed.\n"
        )
        # Use a low threshold to ensure it triggers
        result = score_complexity(plan, topic="Diffusion model", threshold=0.2)
        assert result.recommendation == "beast_mode"
        # Use a very high threshold to ensure it doesn't trigger
        result2 = score_complexity(plan, topic="Diffusion model", threshold=0.99)
        assert result2.recommendation == "code_agent"

    def test_signals_all_present(self):
        """Every scoring signal key is reported even for trivial input."""
        result = score_complexity("some plan", topic="some topic")
        expected_keys = {
            "component_count",
            "file_count_hint",
            "domain_complexity",
            "condition_count",
            "historical_failure",
            "dependency_depth",
        }
        assert set(result.signals.keys()) == expected_keys

    def test_score_clamped_to_unit_interval(self):
        """Score should never exceed 1.0 even with extreme inputs."""
        plan = " ".join(
            ["encoder decoder discriminator generator critic actor teacher student"] * 10
            + ["model.py trainer.py dataset.py multiple files modular"] * 10
            + ["multi-modal distributed GAN diffusion NeRF MoE meta-learning"] * 10
            + ["condition_1 condition_2 condition_3 ablation_4 variant_5 baseline"] * 10
            + ["custom layer custom loss wrapper registry hook callback"] * 10
        )
        result = score_complexity(plan, topic="everything", historical_failures=100)
        assert 0.0 <= result.score <= 1.0

    def test_domain_complexity_keywords(self):
        """Specialized domain keywords contribute a positive domain_complexity signal."""
        plan = "Implement a physics-informed neural network (PINN) with neural ODE solver."
        result = score_complexity(plan, topic="PINN for fluid dynamics")
        assert result.signals["domain_complexity"] > 0
# ============================================================
# TestOpenCodeBridge
# ============================================================
class TestOpenCodeBridge:
    """Tests for the OpenCode bridge class.

    Covers: CLI availability probing, workspace preparation and
    ``opencode.json`` provider-config generation, model-name resolution,
    output-file collection, and the top-level ``generate`` flow with the
    actual CLI invocation mocked out.
    """

    def test_check_available_returns_false_when_not_installed(self):
        # shutil.which returning None means the opencode CLI is absent.
        with patch(
            "researchclaw.pipeline.opencode_bridge.shutil.which",
            return_value=None,
        ):
            assert OpenCodeBridge.check_available() is False

    def test_check_available_returns_false_on_timeout(self):
        # A hanging version probe (TimeoutExpired) is treated as unavailable
        # rather than propagating the exception.
        with patch(
            "researchclaw.pipeline.opencode_bridge.shutil.which",
            return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd",
        ), patch(
            "researchclaw.pipeline.opencode_bridge.subprocess.run",
            side_effect=subprocess.TimeoutExpired(cmd="opencode", timeout=15),
        ):
            assert OpenCodeBridge.check_available() is False

    def test_check_available_returns_true(self):
        mock_result = MagicMock()
        mock_result.returncode = 0
        with patch(
            "researchclaw.pipeline.opencode_bridge.shutil.which",
            return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd",
        ), patch(
            "researchclaw.pipeline.opencode_bridge.subprocess.run",
            return_value=mock_result,
        ) as run_mock:
            assert OpenCodeBridge.check_available() is True
            # The resolved absolute path (not the bare "opencode" name) must
            # be what is actually executed — matters on Windows where the
            # entry point is a .cmd shim.
            assert run_mock.call_args.args[0][0].endswith("opencode.cmd")

    def test_workspace_creates_correct_files(self, tmp_path):
        bridge = OpenCodeBridge(
            model="gpt-5.2",
            llm_base_url="https://example.com",
            api_key_env="TEST_KEY",
        )
        ws = bridge._prepare_workspace(
            stage_dir=tmp_path,
            topic="Test topic",
            exp_plan="plan: test",
            metric="accuracy",
            pkg_hint="torch available",
            extra_guidance="Be careful",
            time_budget_sec=300,
        )
        # The workspace must contain the plan, the guidance doc, and the
        # generated opencode.json provider config.
        assert (ws / "EXPERIMENT_PLAN.yaml").exists()
        assert (ws / "GUIDANCE.md").exists()
        assert (ws / "opencode.json").exists()
        # Topic and metric are surfaced verbatim in the guidance document.
        guidance = (ws / "GUIDANCE.md").read_text()
        assert "Test topic" in guidance
        assert "accuracy" in guidance

    def test_opencode_config_azure_format(self, tmp_path):
        bridge = OpenCodeBridge(
            model="gpt-5.2",
            llm_base_url="https://huaxi.openai.azure.com/openai/v1",
            api_key_env="AZURE_OPENAI_API_KEY",
            llm_provider="azure",
        )
        ws = bridge._prepare_workspace(
            stage_dir=tmp_path,
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
            extra_guidance="",
            time_budget_sec=300,
        )
        cfg = json.loads((ws / "opencode.json").read_text())
        # Azure now uses the unified "openai" provider (Bearer token auth
        # works on Azure endpoints and Responses API is supported)
        assert cfg["model"] == "openai/gpt-5.2"
        assert "provider" in cfg
        assert "openai" in cfg["provider"]
        # The Azure base URL is passed through untouched, and the API key is
        # referenced via OpenCode's {env:...} substitution syntax.
        assert cfg["provider"]["openai"]["options"]["baseURL"] == "https://huaxi.openai.azure.com/openai/v1"
        assert "{env:AZURE_OPENAI_API_KEY}" in cfg["provider"]["openai"]["options"]["apiKey"]

    def test_opencode_config_openai_format(self, tmp_path):
        bridge = OpenCodeBridge(
            model="gpt-4o",
            llm_base_url="https://api.openai.com/v1",
            api_key_env="OPENAI_API_KEY",
        )
        ws = bridge._prepare_workspace(
            stage_dir=tmp_path,
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
            extra_guidance="",
            time_budget_sec=300,
        )
        cfg = json.loads((ws / "opencode.json").read_text())
        # Plain OpenAI endpoint: model gets the "openai/" prefix.
        assert cfg["model"] == "openai/gpt-4o"
        assert "openai" in cfg["provider"]

    def test_opencode_config_preserves_prefixed_model(self, tmp_path):
        """Model with '/' prefix (e.g. anthropic/...) should NOT get double-prefixed (BUG-C fix)."""
        bridge = OpenCodeBridge(
            model="anthropic/claude-sonnet-4-6",
            llm_base_url="https://huaxi.openai.azure.com/openai/v1",
            api_key_env="AZURE_API_KEY",
            llm_provider="azure",
        )
        ws = bridge._prepare_workspace(
            stage_dir=tmp_path,
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
            extra_guidance="",
            time_budget_sec=300,
        )
        cfg = json.loads((ws / "opencode.json").read_text())
        # Should be "anthropic/claude-sonnet-4-6", NOT "azure/anthropic/claude-sonnet-4-6"
        assert cfg["model"] == "anthropic/claude-sonnet-4-6"

    def test_resolve_model_azure_uses_openai_prefix(self):
        """Azure endpoint → uses openai/ prefix (Azure supports Responses API now)."""
        bridge = OpenCodeBridge(
            model="gpt-5.2",
            llm_base_url="https://huaxi.openai.azure.com/openai/v1",
            llm_provider="azure",
        )
        resolved = bridge._resolve_opencode_model()
        assert resolved == "openai/gpt-5.2"

    def test_resolve_model_preserves_explicit_prefix(self):
        """Model with '/' prefix should be used as-is regardless of provider."""
        bridge = OpenCodeBridge(
            model="anthropic/claude-sonnet-4-6",
            llm_base_url="https://huaxi.openai.azure.com/openai/v1",
            llm_provider="azure",
        )
        resolved = bridge._resolve_opencode_model()
        assert resolved == "anthropic/claude-sonnet-4-6"

    def test_resolve_model_no_model_default(self):
        """Empty model string → default Anthropic model."""
        bridge = OpenCodeBridge()
        assert bridge._resolve_opencode_model() == "anthropic/claude-sonnet-4-6"

    def test_collect_files_ignores_pycache(self, tmp_path):
        (tmp_path / "main.py").write_text("print('hello')")
        pycache = tmp_path / "__pycache__"
        pycache.mkdir()
        (pycache / "main.cpython-311.pyc").write_text("bytecode")
        # Also write a .py in pycache to test filtering
        (pycache / "cached.py").write_text("cached")
        files = OpenCodeBridge._collect_files(tmp_path)
        assert "main.py" in files
        # Nothing under __pycache__ may leak into the collected set, even
        # files with an otherwise-allowed .py extension.
        assert not any("__pycache__" in k for k in files)

    def test_collect_files_includes_requirements(self, tmp_path):
        (tmp_path / "main.py").write_text("import torch")
        (tmp_path / "requirements.txt").write_text("torch>=2.0")
        files = OpenCodeBridge._collect_files(tmp_path)
        assert "requirements.txt" in files
        assert "main.py" in files

    def test_collect_files_flattens_subdirectories(self, tmp_path):
        """Files in subdirs should be flattened to basenames (BUG-D fix)."""
        src = tmp_path / "src"
        src.mkdir()
        (src / "model.py").write_text("class Model: pass")
        (src / "utils.py").write_text("def helper(): pass")
        (tmp_path / "main.py").write_text("from model import Model")
        files = OpenCodeBridge._collect_files(tmp_path)
        # Keys should be flat basenames, not paths like "src/model.py"
        assert "model.py" in files
        assert "utils.py" in files
        assert "main.py" in files
        assert not any("/" in k for k in files)

    def test_collect_files_root_takes_priority_over_subdir(self, tmp_path):
        """Root-level file wins when basename collides with subdir file."""
        (tmp_path / "main.py").write_text("root version")
        sub = tmp_path / "src"
        sub.mkdir()
        (sub / "main.py").write_text("subdir version")
        files = OpenCodeBridge._collect_files(tmp_path)
        assert files["main.py"] == "root version"

    def test_generate_returns_error_on_not_installed(self, tmp_path):
        # generate() must fail gracefully (no exception) when the CLI is
        # missing, reporting the cause in result.error.
        bridge = OpenCodeBridge()
        with patch.object(OpenCodeBridge, "check_available", return_value=False):
            result = bridge.generate(
                stage_dir=tmp_path,
                topic="test",
                exp_plan="plan",
                metric="acc",
            )
            assert not result.success
            assert "not installed" in result.error

    def test_generate_returns_error_on_cli_failure(self, tmp_path):
        # With zero retries, a single failed CLI invocation surfaces as an
        # unsuccessful result whose error mentions the failure.
        bridge = OpenCodeBridge(max_retries=0, workspace_cleanup=True)
        with patch.object(OpenCodeBridge, "check_available", return_value=True), \
                patch.object(
                    bridge,
                    "_invoke_opencode",
                    return_value=(False, "CLI error", 1.5),
                ):
            result = bridge.generate(
                stage_dir=tmp_path,
                topic="test",
                exp_plan="plan",
                metric="acc",
            )
            assert not result.success
            assert "failed" in result.error.lower()

    def test_generate_success(self, tmp_path):
        bridge = OpenCodeBridge(max_retries=0, workspace_cleanup=False)

        def fake_invoke(workspace, prompt):
            # Write main.py into the workspace to simulate OpenCode output
            (workspace / "main.py").write_text("print('acc: 0.95')")
            (workspace / "requirements.txt").write_text("torch")
            return True, "success", 5.0

        with patch.object(OpenCodeBridge, "check_available", return_value=True), \
                patch.object(bridge, "_invoke_opencode", side_effect=fake_invoke):
            result = bridge.generate(
                stage_dir=tmp_path,
                topic="test",
                exp_plan="plan",
                metric="acc",
            )
            # Files written by the (mocked) CLI are collected into the
            # result, and the invocation's elapsed time is passed through.
            assert result.success
            assert "main.py" in result.files
            assert result.elapsed_sec == 5.0

    def test_invoke_opencode_uses_resolved_path(self, tmp_path):
        bridge = OpenCodeBridge(model="gpt-5.2", timeout_sec=10)
        mock_result = MagicMock()
        mock_result.returncode = 0
        mock_result.stdout = "{}"
        mock_result.stderr = ""
        with patch(
            "researchclaw.pipeline.opencode_bridge.shutil.which",
            return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd",
        ), patch(
            "researchclaw.pipeline.opencode_bridge.subprocess.run",
            return_value=mock_result,
        ) as run_mock:
            success, _log, _elapsed = bridge._invoke_opencode(tmp_path, "test prompt")
            assert success is True
            # Same Windows concern as check_available: the resolved .cmd
            # path must be argv[0] of the spawned process.
            assert run_mock.call_args.args[0][0].endswith("opencode.cmd")
# ============================================================
# TestEnsureMainEntryPoint (BUG-R52-01)
# ============================================================
class TestHasMainGuard:
    """Behavioural checks for the ``_has_main_guard`` static helper."""

    def test_empty(self):
        assert OpenCodeBridge._has_main_guard("") is False

    def test_syntax_error(self):
        # Unparsable source must be reported as guard-less, not raise.
        assert OpenCodeBridge._has_main_guard("def broken(") is False

    def test_without_guard(self):
        snippet = "def main():\n pass\n"
        assert OpenCodeBridge._has_main_guard(snippet) is False

    def test_with_guard(self):
        # Canonical double-quoted guard form is recognised.
        snippet = 'def main():\n pass\n\nif __name__ == "__main__":\n main()\n'
        assert OpenCodeBridge._has_main_guard(snippet) is True

    def test_single_quote_guard(self):
        # Single-quoted "__main__" is equally valid Python and must match.
        snippet = "if __name__ == '__main__':\n print('hi')\n"
        assert OpenCodeBridge._has_main_guard(snippet) is True
class TestEnsureMainEntryPoint:
    """Tests for _ensure_main_entry_point — BUG-R52-01 fix.

    The helper guarantees the generated file set has a runnable main.py:
    it may swap main.py's contents with another .py file that already has
    a ``__main__`` guard, or inject a guard that calls a detected entry
    function (main(), run*(), etc.). Non-.py files are never inspected.
    """

    def test_already_has_guard_unchanged(self):
        files = {
            "main.py": 'def run():\n pass\n\nif __name__ == "__main__":\n run()\n',
            "utils.py": "def helper(): pass\n",
        }
        result = OpenCodeBridge._ensure_main_entry_point(files)
        assert result is files  # Same object, unchanged

    def test_no_main_py_unchanged(self):
        # Without a main.py there is nothing to repair — input passes through.
        files = {"utils.py": "def helper(): pass\n"}
        result = OpenCodeBridge._ensure_main_entry_point(files)
        assert result is files

    def test_swap_entry_point_from_other_file(self):
        """When main.py is library-only and another file has __main__, swap."""
        lib_code = "class Model:\n pass\n\ndef train(model):\n pass\n"
        entry_code = (
            'from main import Model, train\n\n'
            'if __name__ == "__main__":\n'
            ' m = Model()\n'
            ' train(m)\n'
        )
        files = {
            "main.py": lib_code,
            "run_experiment.py": entry_code,
        }
        result = OpenCodeBridge._ensure_main_entry_point(files)
        # main.py should now contain the entry point code
        assert '__main__' in result["main.py"]
        # The old main.py content should be in run_experiment.py
        assert result["run_experiment.py"] == lib_code

    def test_inject_entry_for_main_function(self):
        """When main.py defines main() but no guard, inject one."""
        code = "import torch\n\ndef main():\n print('training')\n"
        files = {"main.py": code}
        result = OpenCodeBridge._ensure_main_entry_point(files)
        assert '__main__' in result["main.py"]
        assert "main()" in result["main.py"]

    def test_inject_entry_for_run_function(self):
        """Should also detect run(), train(), etc."""
        code = "def run_experiment():\n print('running')\n"
        files = {"main.py": code}
        result = OpenCodeBridge._ensure_main_entry_point(files)
        assert '__main__' in result["main.py"]
        assert "run_experiment()" in result["main.py"]

    def test_no_known_entry_function_warns(self):
        """When no known entry function exists, return unchanged with warning."""
        code = "class Config:\n x = 1\n\nclass Trainer:\n pass\n"
        files = {"main.py": code}
        result = OpenCodeBridge._ensure_main_entry_point(files)
        # Should return unchanged since no entry function found
        assert result["main.py"] == code

    def test_non_py_files_not_checked(self):
        """requirements.txt and setup.py should not be checked for __main__."""
        lib_code = "class Model:\n pass\n"
        files = {
            "main.py": lib_code,
            "requirements.txt": "torch>=2.0\n",
            "setup.py": "# setup\n",
        }
        result = OpenCodeBridge._ensure_main_entry_point(files)
        # No swap should occur — only .py files are checked
        assert result["main.py"] == lib_code

    def test_swap_preserves_other_files(self):
        """Swapping should not lose any files from the dict."""
        files = {
            "main.py": "class Lib: pass\n",
            "run.py": 'if __name__ == "__main__":\n print("go")\n',
            "utils.py": "def helper(): pass\n",
            "requirements.txt": "numpy\n",
        }
        result = OpenCodeBridge._ensure_main_entry_point(files)
        assert len(result) == len(files)
        assert "utils.py" in result
        assert "requirements.txt" in result
# ============================================================
# TestOpenCodeConfig
# ============================================================
class TestOpenCodeConfig:
    """Checks for the OpenCodeConfig dataclass defaults and its dict parser."""

    def test_default_values(self):
        defaults = OpenCodeConfig()
        assert defaults.enabled is True
        assert defaults.auto is True
        assert defaults.complexity_threshold == 0.2
        assert defaults.model == ""
        assert defaults.timeout_sec == 600
        assert defaults.max_retries == 1
        assert defaults.workspace_cleanup is True

    def test_parse_from_dict(self):
        raw = {
            "enabled": True,
            "auto": True,
            "complexity_threshold": 0.5,
            "model": "gpt-5.2",
            "timeout_sec": 900,
            "max_retries": 2,
            "workspace_cleanup": False,
        }
        parsed = _parse_opencode_config(raw)
        # Every field must mirror the raw mapping exactly.
        assert parsed.enabled is True
        assert parsed.auto is True
        assert parsed.complexity_threshold == 0.5
        assert parsed.model == "gpt-5.2"
        assert parsed.timeout_sec == 900
        assert parsed.max_retries == 2
        assert parsed.workspace_cleanup is False

    def test_empty_dict_returns_default(self):
        # An empty mapping parses to the same value as the default config.
        assert _parse_opencode_config({}) == OpenCodeConfig()
# ============================================================
# TestCountHistoricalFailures
# ============================================================
class TestCountHistoricalFailures:
    """count_historical_failures scans stage directories for failure markers.

    It recognises beast_mode_log.json with ``success: false``,
    validation_report.md containing FAILED text, and stage_health.json —
    but multiple indicators in the same directory count as ONE failure.
    """

    def test_no_failures(self, tmp_path):
        # An empty artifacts dir yields zero failures.
        assert count_historical_failures(tmp_path) == 0

    def test_counts_beast_mode_failures(self, tmp_path):
        d = tmp_path / "stage-10_001"
        d.mkdir()
        (d / "beast_mode_log.json").write_text(json.dumps({"success": False}))
        assert count_historical_failures(tmp_path) >= 1

    def test_counts_validation_failures(self, tmp_path):
        d = tmp_path / "stage-10_002"
        d.mkdir()
        (d / "validation_report.md").write_text("**Status**: FAILED after 5 repairs")
        assert count_historical_failures(tmp_path) >= 1

    def test_deduplicates_multiple_failure_indicators(self, tmp_path):
        """Same dir with beast_mode_log + stage_health + validation_report = 1 failure (BUG-E fix)."""
        d = tmp_path / "stage-10_003"
        d.mkdir()
        (d / "beast_mode_log.json").write_text(json.dumps({"success": False}))
        (d / "stage_health.json").write_text(json.dumps({"status": "FAILED"}))
        (d / "validation_report.md").write_text("FAILED after 3 repairs")
        assert count_historical_failures(tmp_path) == 1
================================================
FILE: tests/test_overleaf.py
================================================
"""Tests for Overleaf sync (C4): Sync engine, Conflict resolver, Watcher, Formatter."""
from __future__ import annotations
import textwrap
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.overleaf.sync import OverleafSync
from researchclaw.overleaf.conflict import ConflictResolver, _extract_conflicts, _resolve_content
from researchclaw.overleaf.watcher import FileWatcher
from researchclaw.overleaf.formatter import LatexFormatter
# ══════════════════════════════════════════════════════════════════
# ConflictResolver tests
# ══════════════════════════════════════════════════════════════════
class TestConflictResolver:
    """Detection and resolution of git merge-conflict markers in .tex files."""

    def test_no_conflicts(self, tmp_path: Path) -> None:
        (tmp_path / "paper.tex").write_text("\\section{Intro}\nHello world\n")
        resolver = ConflictResolver()
        assert not resolver.has_conflicts(tmp_path)

    def test_has_conflicts(self, tmp_path: Path) -> None:
        content = textwrap.dedent("""\
            \\section{Intro}
            <<<<<<< HEAD
            Our method is great.
            =======
            Our method is good.
            >>>>>>> remote
        """)
        (tmp_path / "paper.tex").write_text(content)
        resolver = ConflictResolver()
        assert resolver.has_conflicts(tmp_path)

    def test_detect_conflicts(self, tmp_path: Path) -> None:
        content = textwrap.dedent("""\
            <<<<<<< HEAD
            line A
            =======
            line B
            >>>>>>> remote
        """)
        (tmp_path / "main.tex").write_text(content)
        resolver = ConflictResolver()
        conflicts = resolver.detect(tmp_path)
        # Each conflict dict exposes the local ("ours") and remote
        # ("theirs") sides of the marker block.
        assert len(conflicts) == 1
        assert conflicts[0]["ours"] == "line A"
        assert conflicts[0]["theirs"] == "line B"

    def test_resolve_ours(self, tmp_path: Path) -> None:
        content = textwrap.dedent("""\
            \\section{Intro}
            <<<<<<< HEAD
            AI version
            =======
            Human version
            >>>>>>> remote
            \\section{End}
        """)
        (tmp_path / "paper.tex").write_text(content)
        resolver = ConflictResolver()
        resolved = resolver.resolve(tmp_path, strategy="ours")
        assert len(resolved) == 1
        text = (tmp_path / "paper.tex").read_text()
        # "ours" keeps the local (HEAD) side and strips all markers.
        assert "AI version" in text
        assert "Human version" not in text
        assert "<<<<<<" not in text

    def test_resolve_theirs(self, tmp_path: Path) -> None:
        content = textwrap.dedent("""\
            <<<<<<< HEAD
            AI text
            =======
            Human text
            >>>>>>> remote
        """)
        (tmp_path / "paper.tex").write_text(content)
        resolver = ConflictResolver()
        resolver.resolve(tmp_path, strategy="theirs")
        text = (tmp_path / "paper.tex").read_text()
        # "theirs" keeps the remote side instead.
        assert "Human text" in text
        assert "AI text" not in text

    def test_multiple_conflicts(self, tmp_path: Path) -> None:
        # Two independent conflict blocks separated by clean text.
        content = textwrap.dedent("""\
            <<<<<<< HEAD
            A1
            =======
            B1
            >>>>>>> remote
            middle
            <<<<<<< HEAD
            A2
            =======
            B2
            >>>>>>> remote
        """)
        (tmp_path / "paper.tex").write_text(content)
        resolver = ConflictResolver()
        conflicts = resolver.detect(tmp_path)
        assert len(conflicts) == 2
class TestConflictHelpers:
    """Unit tests for the module-level conflict parsing/resolution helpers."""

    # One canonical conflict block shared by the resolution tests.
    CONFLICT = "<<<<<<< HEAD\nours\n=======\ntheirs\n>>>>>>> remote\n"

    def test_extract_conflicts_empty(self) -> None:
        assert _extract_conflicts("no conflicts here") == []

    def test_resolve_content_ours(self) -> None:
        merged = _resolve_content(self.CONFLICT, "ours")
        assert "ours" in merged
        assert "theirs" not in merged

    def test_resolve_content_theirs(self) -> None:
        merged = _resolve_content(self.CONFLICT, "theirs")
        assert "theirs" in merged
        assert "ours" not in merged
# ══════════════════════════════════════════════════════════════════
# FileWatcher tests
# ══════════════════════════════════════════════════════════════════
class TestFileWatcher:
    """Polling-style change detection (new / modified / deleted files)."""

    def test_no_changes_initially(self, tmp_path: Path) -> None:
        # Files present at construction time form the baseline snapshot,
        # so the first poll reports nothing.
        (tmp_path / "paper.tex").write_text("content")
        watcher = FileWatcher(tmp_path)
        assert watcher.check_changes() == []

    def test_detect_new_file(self, tmp_path: Path) -> None:
        watcher = FileWatcher(tmp_path)
        (tmp_path / "new.tex").write_text("new content")
        changes = watcher.check_changes()
        assert "new.tex" in changes

    def test_detect_modified_file(self, tmp_path: Path) -> None:
        f = tmp_path / "paper.tex"
        f.write_text("v1")
        watcher = FileWatcher(tmp_path)
        # Modify
        import time
        # Brief sleep so the rewrite lands on a later mtime tick; coarse
        # filesystem timestamps could otherwise hide the modification.
        time.sleep(0.05)
        f.write_text("v2")
        changes = watcher.check_changes()
        assert "paper.tex" in changes

    def test_detect_deleted_file(self, tmp_path: Path) -> None:
        f = tmp_path / "paper.tex"
        f.write_text("content")
        watcher = FileWatcher(tmp_path)
        f.unlink()
        changes = watcher.check_changes()
        # Deletions are reported as changes too.
        assert "paper.tex" in changes

    def test_only_watches_extensions(self, tmp_path: Path) -> None:
        # Files with non-matching extensions are invisible to the watcher.
        watcher = FileWatcher(tmp_path, extensions=(".tex",))
        (tmp_path / "readme.md").write_text("markdown")
        changes = watcher.check_changes()
        assert changes == []

    def test_nonexistent_dir(self, tmp_path: Path) -> None:
        # Watching a missing directory must not raise.
        watcher = FileWatcher(tmp_path / "nonexistent")
        assert watcher.check_changes() == []
# ══════════════════════════════════════════════════════════════════
# LatexFormatter tests
# ══════════════════════════════════════════════════════════════════
class TestLatexFormatter:
    """Helpers that sanitise LaTeX sources before they are pushed to Overleaf."""

    def test_normalize_paths(self) -> None:
        # Absolute artifact paths are rewritten to relative figure paths.
        content = r"\includegraphics[width=0.5\textwidth]{/home/user/artifacts/rc-123/figures/plot.png}"
        result = LatexFormatter.normalize_paths(content)
        assert "figures/plot.png" in result
        assert "/home/user" not in result

    def test_ensure_document_class_adds(self) -> None:
        content = "\\begin{document}\nHello\n\\end{document}"
        result = LatexFormatter.ensure_document_class(content)
        assert "\\documentclass" in result

    def test_ensure_document_class_noop(self) -> None:
        # Idempotent: an existing \documentclass is not duplicated.
        content = "\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}"
        result = LatexFormatter.ensure_document_class(content)
        assert result.count("\\documentclass") == 1

    def test_strip_local_comments(self) -> None:
        # "% RESEARCHCLAW:" marker comments are internal and must not leak,
        # while ordinary content lines are preserved.
        content = "Normal line\n% RESEARCHCLAW: internal note\nAnother line\n"
        result = LatexFormatter.strip_local_comments(content)
        assert "RESEARCHCLAW" not in result
        assert "Normal line" in result
        assert "Another line" in result

    def test_fix_encoding(self) -> None:
        content = "\\documentclass{article}\n\\begin{document}\n"
        result = LatexFormatter.fix_encoding(content)
        assert "\\usepackage[utf8]{inputenc}" in result

    def test_fix_encoding_noop(self) -> None:
        # inputenc is only added once, even if already declared.
        content = "\\documentclass{article}\n\\usepackage[utf8]{inputenc}\n\\begin{document}\n"
        result = LatexFormatter.fix_encoding(content)
        assert result.count("inputenc") == 1

    def test_format_for_overleaf(self, tmp_path: Path) -> None:
        # End-to-end formatting: strips internal comments and adds inputenc.
        tex = tmp_path / "paper.tex"
        tex.write_text("\\documentclass{article}\n% RESEARCHCLAW: test\n\\begin{document}\nHello\n\\end{document}\n")
        formatter = LatexFormatter()
        result = formatter.format_for_overleaf(tex)
        assert "RESEARCHCLAW" not in result
        assert "inputenc" in result
# ══════════════════════════════════════════════════════════════════
# OverleafSync tests (mock git)
# ══════════════════════════════════════════════════════════════════
class TestOverleafSync:
    """OverleafSync lifecycle: unconfigured-state guards and mocked git clone."""

    def test_init(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        assert sync.git_url == "https://git.overleaf.com/abc123"
        assert sync.branch == "main"
        # local_dir stays None until setup() clones the repository.
        assert sync.local_dir is None

    def test_get_status_before_setup(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        status = sync.get_status()
        assert status["local_dir"] is None
        assert status["last_sync"] is None

    def test_push_before_setup_raises(self, tmp_path: Path) -> None:
        # All mutating operations require setup() to have run first; the
        # error message mentions "setup" so callers know the remedy.
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        with pytest.raises(RuntimeError, match="setup"):
            sync.push_paper(tmp_path / "paper.tex")

    def test_pull_before_setup_raises(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        with pytest.raises(RuntimeError, match="setup"):
            sync.pull_changes()

    def test_resolve_before_setup_raises(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        with pytest.raises(RuntimeError, match="setup"):
            sync.resolve_conflicts()

    @patch("researchclaw.overleaf.sync.subprocess.run")
    def test_setup_clones(self, mock_run: MagicMock, tmp_path: Path) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        local = sync.setup(tmp_path)
        # setup() clones into a fixed "overleaf_repo" subdirectory.
        assert local == tmp_path / "overleaf_repo"
        # git clone was called
        mock_run.assert_called_once()
        args = mock_run.call_args[0][0]
        assert "clone" in args
================================================
FILE: tests/test_paper_verifier.py
================================================
"""Tests for paper_verifier — post-generation fabrication detection."""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.pipeline.paper_verifier import (
VerificationResult,
verify_paper,
)
from researchclaw.pipeline.verified_registry import VerifiedRegistry
# Repository-level artifacts directory (tests/ -> repo root -> artifacts/),
# used by the integration tests below; they skip when it is absent.
ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_registry(
    conditions: dict[str, dict[int, float]] | None = None,
    primary_metric: float | None = None,
) -> VerifiedRegistry:
    """Build a VerifiedRegistry from a compact test specification.

    Args:
        conditions: Maps condition name -> {seed index -> metric value}.
            Per-seed values land under ``best_run.metrics`` as
            ``"<cond>/<seed>/metric"``; the per-condition mean lands under
            ``condition_summaries``. Empty/None conditions are skipped.
        primary_metric: Optional value stored as
            ``best_run.metrics["primary_metric"]``.

    Returns:
        A registry built via ``VerifiedRegistry.from_experiment``.

    Note: previously accepted opaque ``**kwargs``; explicit keyword
    parameters are backward-compatible (all call sites pass keywords) and
    self-documenting. A guard also skips empty seed dicts, which would
    otherwise raise ZeroDivisionError when computing the mean.
    """
    summary = {"best_run": {"metrics": {}}, "condition_summaries": {}, "metrics_summary": {}}
    for cond_name, seeds in (conditions or {}).items():
        if not seeds:
            # No seeds recorded for this condition — nothing to summarise.
            continue
        for seed_idx, value in seeds.items():
            summary["best_run"]["metrics"][f"{cond_name}/{seed_idx}/metric"] = value
        mean_val = sum(seeds.values()) / len(seeds)
        summary["condition_summaries"][cond_name] = {"metrics": {"metric": mean_val}}
    if primary_metric is not None:
        summary["best_run"]["metrics"]["primary_metric"] = primary_metric
    return VerifiedRegistry.from_experiment(summary)
# ---------------------------------------------------------------------------
# Unit tests — clean paper
# ---------------------------------------------------------------------------
class TestCleanPaper:
    """Papers whose numbers all trace back to the registry should PASS."""

    def test_all_numbers_verified_passes(self):
        reg = _make_registry(
            conditions={"Baseline": {0: 80.0, 1: 82.0}, "Proposed": {0: 90.0, 1: 92.0}},
            primary_metric=91.0,
        )
        # 81.0000 / 91.0000 are the per-condition means and 1.4142 matches
        # the seed spread (presumably the std-dev — derived, not fabricated),
        # so nothing in prose or table should be flagged.
        tex = r"""
\section{Results}
Our proposed method achieves 91.0000 on the primary metric,
compared to 81.0000 for the baseline.
\begin{table}[htbp]
\centering
\begin{tabular}{lcc}
\toprule
Method & Metric & $n$ \\
\midrule
Baseline & 81.0000 $\pm$ 1.4142 & 2 \\
Proposed & 91.0000 $\pm$ 1.4142 & 2 \\
\bottomrule
\end{tabular}
\end{table}
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"
        assert result.strict_violations == 0

    def test_common_constants_allowed(self):
        # Hyperparameter-style constants (batch size, epochs, learning rate)
        # must not be treated as fabricated metric values.
        reg = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Experimental Setup}
We use a batch size of 64 and train for 100 epochs
with a learning rate of 0.001.
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"

    def test_year_numbers_allowed(self):
        # Four-digit years in citation text are not metrics.
        reg = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Introduction}
Following the work of Smith et al. (2025), we propose...
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"
# ---------------------------------------------------------------------------
# Unit tests — fabricated numbers
# ---------------------------------------------------------------------------
class TestFabricatedNumbers:
    """Numbers absent from the registry are flagged by section context:
    Results/tables → REJECT (strict), Discussion → WARN (lenient),
    and numbers inside \\cite keys or comments are skipped entirely."""

    def test_fabricated_in_results_rejects(self):
        reg = _make_registry(
            conditions={"Baseline": {0: 80.0}, "Proposed": {0: 90.0}},
        )
        tex = r"""
\section{Results}
Our method achieves 95.5 accuracy.
"""
        result = verify_paper(tex, reg)
        assert result.severity == "REJECT"
        assert result.strict_violations >= 1
        # The offending value is surfaced in unverified_numbers.
        assert any(abs(u.value - 95.5) < 0.01 for u in result.unverified_numbers)

    def test_fabricated_in_table_rejects(self):
        reg = _make_registry(conditions={"A": {0: 80.0}})
        # 85.3 does not match condition A's recorded 80.0.
        tex = r"""
\section{Results}
\begin{table}[h]
\begin{tabular}{lc}
A & 85.3 \\
\end{tabular}
\end{table}
"""
        result = verify_paper(tex, reg)
        assert result.severity == "REJECT"

    def test_fabricated_in_discussion_warns(self):
        reg = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Discussion}
Compared to prior work reporting 95.5 accuracy, our result is lower.
"""
        result = verify_paper(tex, reg)
        # In Discussion → warning, not reject
        assert result.severity == "WARN"
        assert result.lenient_violations >= 1

    def test_numbers_in_cite_skipped(self):
        # Digits inside \cite keys must not be parsed as metric values.
        reg = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Results}
As shown by \cite{smith2025deep}, our method works.
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"

    def test_numbers_in_comments_skipped(self):
        # LaTeX comment lines are ignored; only the real 80.0 is checked.
        reg = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Results}
% This is a comment with fake number 99.99
Our method achieves 80.0.
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"
# ---------------------------------------------------------------------------
# Unit tests — fabricated conditions
# ---------------------------------------------------------------------------
class TestFabricatedConditions:
    """Table rows naming conditions the registry never ran are fabricated."""

    def test_unknown_condition_in_table(self):
        reg = _make_registry(conditions={"DQN": {0: 80.0}, "DQN+Abstraction": {0: 90.0}})
        # "PPO" was never run — its row must be flagged even though the
        # other two rows match registry values.
        tex = r"""
\section{Results}
\begin{table}[h]
\begin{tabular}{lc}
DQN & 80.0 \\
DQN+Abstraction & 90.0 \\
PPO & 75.0 \\
\end{tabular}
\end{table}
"""
        result = verify_paper(tex, reg)
        assert len(result.fabricated_conditions) >= 1
        assert any(fc.name == "PPO" for fc in result.fabricated_conditions)
        assert result.severity == "REJECT"
# ---------------------------------------------------------------------------
# Unit tests — fabrication rate
# ---------------------------------------------------------------------------
class TestFabricationRate:
    """fabrication_rate is 0.0 for clean papers and positive when any
    reported number cannot be traced to the registry."""

    def test_rate_zero_for_clean_paper(self):
        registry = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Results}
Accuracy is 80.0.
"""
        report = verify_paper(tex, registry)
        assert report.fabrication_rate == 0.0

    def test_rate_nonzero_for_fabricated(self):
        registry = _make_registry(conditions={"A": {0: 80.0}})
        # Neither 99.99 nor 45.67 exists in the registry.
        tex = r"""
\section{Results}
Accuracy is 99.99 and loss is 45.67.
"""
        report = verify_paper(tex, registry)
        assert report.fabrication_rate > 0.0
# ---------------------------------------------------------------------------
# Integration — real fabricated papers
# ---------------------------------------------------------------------------
class TestRealPapers:
    """Regression tests against real pipeline artifacts (skipped if absent)."""

    def _load(self, run_id: str) -> tuple[str, VerifiedRegistry]:
        # Locate the artifact directory for this run id and load its paper
        # plus the experiment summary (and optional refinement log) into a
        # registry. Skips — rather than fails — when artifacts are missing,
        # so environments without the artifacts/ tree still pass.
        pattern = f"rc-*-{run_id}"
        matches = sorted(ARTIFACTS.glob(pattern))
        if not matches:
            pytest.skip(f"Artifact {run_id} not found")
        base = matches[0]
        tex_path = base / "stage-22" / "paper.tex"
        summary_path = base / "stage-14" / "experiment_summary.json"
        ref_path = base / "stage-13" / "refinement_log.json"
        if not tex_path.exists() or not summary_path.exists():
            pytest.skip(f"Missing files for {run_id}")
        tex = tex_path.read_text(encoding="utf-8")
        summary = json.loads(summary_path.read_text())
        ref_log = None
        if ref_path.exists():
            ref_log = json.loads(ref_path.read_text())
        reg = VerifiedRegistry.from_experiment(summary, ref_log)
        return tex, reg

    def test_run_e57360_severe_fabrication_detected(self):
        """Run 38 (LACE) — audit found SEVERE fabrication.
        The verifier should REJECT this paper."""
        tex, reg = self._load("e57360")
        result = verify_paper(tex, reg)
        assert result.severity == "REJECT", (
            f"Expected REJECT for severely fabricated paper, got {result.severity}. "
            f"Unverified: {len(result.unverified_numbers)}, "
            f"Fabricated conditions: {[fc.name for fc in result.fabricated_conditions]}"
        )

    def test_run_6a1ec9_severe_fabrication_detected(self):
        """Run 6a1ec9 (FAME) — audit found SEVERE fabrication."""
        tex, reg = self._load("6a1ec9")
        result = verify_paper(tex, reg)
        assert result.severity == "REJECT"

    def test_run_85fefc_fabrication_detected(self):
        """Run 85fefc (CRAFT) — audit found SEVERE fabrication."""
        tex, reg = self._load("85fefc")
        result = verify_paper(tex, reg)
        # Should detect at least some issues
        assert len(result.unverified_numbers) > 0 or len(result.fabricated_conditions) > 0

    def test_run_acbdfa_moderate_fabrication(self):
        """Run acbdfa (CTS) — audit found MODERATE fabrication."""
        tex, reg = self._load("acbdfa")
        result = verify_paper(tex, reg)
        # May or may not reject (moderate case), but should find issues
        assert len(result.unverified_numbers) > 0 or result.lenient_violations > 0
================================================
FILE: tests/test_project_manager.py
================================================
"""Tests for multi-project management (C1): ProjectManager, ProjectScheduler, IdeaPool."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
import pytest
from researchclaw.project.models import Idea, Project
from researchclaw.project.manager import ProjectManager
from researchclaw.project.scheduler import ProjectScheduler
from researchclaw.project.idea_pool import IdeaPool
# ── fixtures ──────────────────────────────────────────────────────
@pytest.fixture
def tmp_projects(tmp_path: Path) -> Path:
    """Per-test registry directory for ProjectManager (not created here)."""
    return tmp_path / "projects"
@pytest.fixture
def manager(tmp_projects: Path) -> ProjectManager:
    """Fresh ProjectManager rooted at the per-test projects directory."""
    return ProjectManager(tmp_projects)
@pytest.fixture
def config_yaml(tmp_path: Path) -> Path:
    """Minimal on-disk YAML config that create()/to_project() can copy."""
    cfg = tmp_path / "config.yaml"
    cfg.write_text("project:\n name: test\nresearch:\n topic: test\n")
    return cfg
@pytest.fixture
def pool_path(tmp_path: Path) -> Path:
    """Path of the JSON file backing the IdeaPool under test."""
    return tmp_path / "ideas.json"
# ══════════════════════════════════════════════════════════════════
# Project model tests
# ══════════════════════════════════════════════════════════════════
class TestProjectModel:
    """Serialization behaviour of the Project dataclass."""

    def test_to_dict_roundtrip(self) -> None:
        # A serialize/deserialize cycle must preserve the identity fields.
        original = Project(name="test", config_path="/a/b", run_dir="/c/d", topic="ml")
        restored = Project.from_dict(original.to_dict())
        assert restored.name == original.name
        assert restored.topic == original.topic
        assert restored.status == "idle"

    def test_from_dict_defaults(self) -> None:
        # Fields absent from the payload fall back to their defaults.
        payload = {"name": "x", "config_path": "/a", "run_dir": "/b"}
        project = Project.from_dict(payload)
        assert project.status == "idle"
        assert project.last_run_id is None

    def test_from_dict_with_iso_date(self) -> None:
        # ISO-8601 timestamps are parsed back into datetime objects.
        payload = {
            "name": "x",
            "config_path": "/a",
            "run_dir": "/b",
            "created_at": "2024-01-01T00:00:00+00:00",
        }
        project = Project.from_dict(payload)
        assert project.created_at.year == 2024
# ══════════════════════════════════════════════════════════════════
# Idea model tests
# ══════════════════════════════════════════════════════════════════
class TestIdeaModel:
    """Score computation and serialization of the Idea dataclass."""

    def test_score_calculation(self) -> None:
        perfect = Idea(id="1", title="t", description="d", feasibility=1.0, novelty=1.0)
        assert perfect.score == pytest.approx(1.0)

    def test_score_weighted(self) -> None:
        middling = Idea(id="1", title="t", description="d", feasibility=0.5, novelty=0.5)
        assert middling.score == pytest.approx(0.5)

    def test_to_dict_roundtrip(self) -> None:
        source = Idea(id="abc", title="GNN", description="graph stuff", domains=["ml"])
        restored = Idea.from_dict(source.to_dict())
        assert restored.id == "abc"
        assert restored.domains == ["ml"]
# ══════════════════════════════════════════════════════════════════
# ProjectManager tests
# ══════════════════════════════════════════════════════════════════
class TestProjectManager:
    """Lifecycle operations: create/delete/get/switch, runs, and persistence."""

    def test_create_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        created = manager.create("my_project", str(config_yaml), topic="RL")
        assert created.name == "my_project"
        assert created.topic == "RL"
        assert created.status == "idle"

    def test_create_sets_active(self, manager: ProjectManager, config_yaml: Path) -> None:
        # The first project created becomes the active one automatically.
        manager.create("first", str(config_yaml))
        assert manager.active is not None
        assert manager.active.name == "first"

    def test_create_duplicate_raises(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("dup", str(config_yaml))
        with pytest.raises(ValueError, match="already exists"):
            manager.create("dup", str(config_yaml))

    def test_delete_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("del_me", str(config_yaml))
        manager.delete("del_me")
        assert "del_me" not in manager.projects

    def test_delete_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            manager.delete("nonexistent")

    def test_get_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("proj1", str(config_yaml))
        assert manager.get("proj1").name == "proj1"

    def test_get_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            manager.get("nope")

    def test_list_all_sorted(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("b_proj", str(config_yaml))
        manager.create("a_proj", str(config_yaml))
        listing = manager.list_all()
        assert len(listing) == 2
        # Ordered by creation time, so "b_proj" (created first) leads.
        assert listing[0].name == "b_proj"

    def test_get_status(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("s1", str(config_yaml))
        manager.create("s2", str(config_yaml))
        snapshot = manager.get_status()
        assert snapshot["total"] == 2
        assert snapshot["active"] == "s1"

    def test_switch_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("a", str(config_yaml))
        manager.create("b", str(config_yaml))
        manager.switch("b")
        assert manager.active is not None
        assert manager.active.name == "b"

    def test_switch_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            manager.switch("ghost")

    def test_compare_projects(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("pa", str(config_yaml))
        manager.create("pb", str(config_yaml))
        manager.projects["pa"].metrics = {"acc": 0.9}
        manager.projects["pb"].metrics = {"acc": 0.95}
        comparison = manager.compare("pa", "pb")
        assert "metric_diff" in comparison
        assert comparison["metric_diff"]["acc"]["delta"] == pytest.approx(0.05)

    def test_start_run(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("run_proj", str(config_yaml))
        assert manager.start_run("run_proj", run_id="rc-123") == "rc-123"
        assert manager.get("run_proj").status == "running"

    def test_finish_run(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("fin_proj", str(config_yaml))
        manager.start_run("fin_proj", run_id="rc-456")
        manager.finish_run("fin_proj", "completed", {"acc": 0.88})
        finished = manager.get("fin_proj")
        assert finished.status == "completed"
        assert finished.metrics["acc"] == 0.88

    def test_registry_persistence(self, tmp_projects: Path, config_yaml: Path) -> None:
        # Two managers over the same directory share state via the registry file.
        writer = ProjectManager(tmp_projects)
        writer.create("persist", str(config_yaml), topic="persistence")
        reader = ProjectManager(tmp_projects)
        assert "persist" in reader.projects
        assert reader.projects["persist"].topic == "persistence"

    def test_delete_switches_active(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("first", str(config_yaml))
        manager.create("second", str(config_yaml))
        manager.switch("first")
        manager.delete("first")
        # Deleting the active project promotes the remaining one.
        assert manager.active is not None
        assert manager.active.name == "second"

    def test_config_copied_to_project_dir(self, manager: ProjectManager, config_yaml: Path) -> None:
        created = manager.create("copy_test", str(config_yaml))
        copied_cfg = Path(created.config_path)
        assert copied_cfg.exists()
        assert "test" in copied_cfg.read_text()
# ══════════════════════════════════════════════════════════════════
# ProjectScheduler tests
# ══════════════════════════════════════════════════════════════════
class TestProjectScheduler:
    """Queueing, concurrency limits, and priority ordering."""

    def test_enqueue_and_next(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("proj", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=1)
        sched.enqueue("proj")
        assert sched.next() == "proj"

    def test_concurrency_limit(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("a", str(config_yaml))
        manager.create("b", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=1)
        sched.enqueue("a")
        sched.enqueue("b")
        sched.next()  # starts "a"
        assert sched.next() is None  # the single slot is taken; "b" must wait

    def test_mark_done_frees_slot(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("a", str(config_yaml))
        manager.create("b", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=1)
        sched.enqueue("a")
        sched.enqueue("b")
        sched.next()  # starts "a"
        sched.mark_done("a")
        assert sched.next() == "b"

    def test_priority_order(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("low", str(config_yaml))
        manager.create("high", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=2)
        sched.enqueue("low", priority=10)
        sched.enqueue("high", priority=1)
        # Lower numeric priority is dispatched first.
        assert sched.next() == "high"
        assert sched.next() == "low"

    def test_enqueue_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            ProjectScheduler(manager).enqueue("ghost")

    def test_duplicate_enqueue_ignored(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("dup", str(config_yaml))
        sched = ProjectScheduler(manager)
        sched.enqueue("dup")
        sched.enqueue("dup")
        assert sched.queue_size == 1

    def test_get_status(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("s", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=3)
        sched.enqueue("s")
        snapshot = sched.get_status()
        assert snapshot["max_concurrent"] == 3
        assert snapshot["queue_size"] == 1

    def test_can_start_empty_queue(self, manager: ProjectManager) -> None:
        assert not ProjectScheduler(manager).can_start()
# ══════════════════════════════════════════════════════════════════
# IdeaPool tests
# ══════════════════════════════════════════════════════════════════
class TestIdeaPool:
    """CRUD, evaluation, ranking, persistence, and idea-to-project promotion."""

    def test_add_idea(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        added = pool.add("GNN for proteins", "Apply GNN to protein folding", ["bio", "ml"])
        assert added.title == "GNN for proteins"
        assert len(added.id) == 8

    def test_remove_idea(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        doomed = pool.add("remove me", "desc")
        pool.remove(doomed.id)
        assert doomed.id not in pool.ideas

    def test_remove_unknown_raises(self, pool_path: Path) -> None:
        with pytest.raises(KeyError):
            IdeaPool(pool_path).remove("nonexistent")

    def test_get_idea(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        stored = pool.add("get me", "desc")
        assert pool.get(stored.id).title == "get me"

    def test_evaluate(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("eval", "desc")
        outcome = pool.evaluate(idea.id, feasibility=0.8, novelty=0.9)
        assert outcome["feasibility"] == 0.8
        assert outcome["novelty"] == 0.9
        assert pool.get(idea.id).status == "evaluated"

    def test_evaluate_clamps_values(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("clamp", "desc")
        # Out-of-range scores are clamped into [0, 1].
        pool.evaluate(idea.id, feasibility=1.5, novelty=-0.5)
        assert pool.get(idea.id).feasibility == 1.0
        assert pool.get(idea.id).novelty == 0.0

    def test_rank(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        pool.add("low", "desc")
        pool.add("high", "desc")
        pool.evaluate(pool.list_all()[0].id, 0.1, 0.1)
        pool.evaluate(pool.list_all()[1].id, 0.9, 0.9)
        ranked = pool.rank()
        assert ranked[0].score > ranked[1].score

    def test_list_all(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        pool.add("a", "desc")
        pool.add("b", "desc")
        assert len(pool.list_all()) == 2

    def test_persistence(self, pool_path: Path) -> None:
        # A second pool over the same file sees ideas added by the first.
        first = IdeaPool(pool_path)
        first.add("persist", "desc", ["ml"])
        second = IdeaPool(pool_path)
        assert len(second.ideas) == 1
        assert list(second.ideas.values())[0].title == "persist"

    def test_to_project(self, pool_path: Path, tmp_path: Path, config_yaml: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("my idea", "a nice description")
        projects_dir = tmp_path / "projects"
        promoted = pool.to_project(idea.id, str(config_yaml), projects_dir)
        assert promoted.topic == "a nice description"
        assert pool.get(idea.id).status == "planned"
================================================
FILE: tests/test_prompt_adapter.py
================================================
"""Tests for domain-aware prompt adapters."""
from __future__ import annotations
import pytest
from researchclaw.domains.detector import DomainProfile, get_profile, get_generic_profile
from researchclaw.domains.prompt_adapter import (
GenericPromptAdapter,
MLPromptAdapter,
PromptAdapter,
PromptBlocks,
get_adapter,
register_adapter,
)
# ---------------------------------------------------------------------------
# PromptBlocks tests
# ---------------------------------------------------------------------------
class TestPromptBlocks:
    """Default construction and full-field construction of PromptBlocks."""

    def test_default_empty(self):
        empty = PromptBlocks()
        assert empty.compute_budget == ""
        assert empty.dataset_guidance == ""
        assert empty.code_generation_hints == ""

    def test_all_fields(self):
        populated = PromptBlocks(
            compute_budget="budget info",
            dataset_guidance="data info",
            hp_reporting="hp info",
            code_generation_hints="code hints",
            result_analysis_hints="analysis hints",
            experiment_design_context="design context",
            statistical_test_guidance="stat guidance",
            output_format_guidance="output format",
        )
        assert populated.compute_budget == "budget info"
        assert populated.output_format_guidance == "output format"
# ---------------------------------------------------------------------------
# ML Adapter tests
# ---------------------------------------------------------------------------
class TestMLPromptAdapter:
    """The ML adapter contributes nothing: prompt text for ML lives in prompts.py."""

    def test_returns_empty_blocks(self):
        """ML adapter must return empty blocks (delegates to prompts.py)."""
        profile = get_profile("ml_vision") or DomainProfile(
            domain_id="ml_vision", display_name="CV"
        )
        blocks = MLPromptAdapter(profile).get_code_generation_blocks({})
        assert blocks.compute_budget == ""
        assert blocks.dataset_guidance == ""
        assert blocks.code_generation_hints == ""

    def test_all_methods_return_empty(self):
        adapter = MLPromptAdapter(DomainProfile(domain_id="ml_generic", display_name="ML"))
        checked_fields = (
            "compute_budget", "dataset_guidance", "hp_reporting",
            "code_generation_hints", "result_analysis_hints",
        )
        producers = (
            adapter.get_code_generation_blocks,
            adapter.get_experiment_design_blocks,
            adapter.get_result_analysis_blocks,
        )
        for produce in producers:
            blocks = produce({})
            for field_name in checked_fields:
                assert getattr(blocks, field_name) == ""
# ---------------------------------------------------------------------------
# Generic Adapter tests
# ---------------------------------------------------------------------------
class TestGenericPromptAdapter:
    """The generic adapter derives real guidance from the profile's metadata."""

    def test_provides_code_hints(self):
        profile = DomainProfile(
            domain_id="generic",
            display_name="Generic",
            core_libraries=["numpy", "scipy"],
        )
        blocks = GenericPromptAdapter(profile).get_code_generation_blocks({})
        assert blocks.code_generation_hints  # should not be empty

    def test_convergence_hints(self):
        profile = DomainProfile(
            domain_id="test_conv",
            display_name="Conv Test",
            experiment_paradigm="convergence",
        )
        blocks = GenericPromptAdapter(profile).get_code_generation_blocks({})
        assert "convergence" in blocks.code_generation_hints.lower()

    def test_progressive_spec_hints(self):
        profile = DomainProfile(
            domain_id="test_econ",
            display_name="Econ Test",
            experiment_paradigm="progressive_spec",
        )
        blocks = GenericPromptAdapter(profile).get_code_generation_blocks({})
        assert "progressive" in blocks.code_generation_hints.lower()

    def test_experiment_design_has_terminology(self):
        profile = DomainProfile(
            domain_id="test",
            display_name="Test Domain",
            condition_terminology={"baseline": "reference", "proposed": "our method"},
            standard_baselines=["Method A", "Method B"],
        )
        blocks = GenericPromptAdapter(profile).get_experiment_design_blocks({})
        assert "reference" in blocks.experiment_design_context
        assert "Method A" in blocks.experiment_design_context
# ---------------------------------------------------------------------------
# Physics Adapter tests
# ---------------------------------------------------------------------------
class TestPhysicsAdapter:
    """Physics profiles must dispatch to a non-ML adapter with real hints."""

    def test_physics_adapter_loaded(self):
        profile = get_profile("physics_simulation")
        if profile is None:
            pytest.skip("physics_simulation profile not found")
        assert not isinstance(get_adapter(profile), MLPromptAdapter)

    def test_physics_code_blocks_nonempty(self):
        profile = get_profile("physics_pde")
        if profile is None:
            pytest.skip("physics_pde profile not found")
        blocks = get_adapter(profile).get_code_generation_blocks({})
        assert blocks.code_generation_hints  # should have physics-specific hints
# ---------------------------------------------------------------------------
# Economics Adapter tests
# ---------------------------------------------------------------------------
class TestEconomicsAdapter:
    """Economics profiles must dispatch to a non-ML adapter with design context."""

    def test_economics_adapter_loaded(self):
        profile = get_profile("economics_empirical")
        if profile is None:
            pytest.skip("economics_empirical profile not found")
        assert not isinstance(get_adapter(profile), MLPromptAdapter)

    def test_economics_design_blocks(self):
        profile = get_profile("economics_empirical")
        if profile is None:
            pytest.skip("economics_empirical profile not found")
        blocks = get_adapter(profile).get_experiment_design_blocks({})
        assert "progressive" in blocks.experiment_design_context.lower()
# ---------------------------------------------------------------------------
# get_adapter dispatch tests
# ---------------------------------------------------------------------------
class TestGetAdapter:
    """Dispatch: ML ids get MLPromptAdapter; others get specific or generic."""

    def test_ml_domains_get_ml_adapter(self):
        for domain_id in ("ml_vision", "ml_nlp", "ml_rl", "ml_generic"):
            profile = get_profile(domain_id)
            if profile is None:
                continue
            assert isinstance(get_adapter(profile), MLPromptAdapter), (
                f"{domain_id} should use MLPromptAdapter"
            )

    def test_generic_domain_gets_generic_adapter(self):
        assert isinstance(get_adapter(get_generic_profile()), GenericPromptAdapter)

    def test_physics_uses_physics_adapter(self):
        profile = get_profile("physics_simulation")
        if profile is None:
            pytest.skip("physics_simulation profile not found")
        from researchclaw.domains.adapters.physics import PhysicsPromptAdapter
        assert isinstance(get_adapter(profile), PhysicsPromptAdapter)

    def test_unknown_domain_gets_generic(self):
        # Unregistered domain ids fall back to the generic adapter.
        profile = DomainProfile(domain_id="unknown_domain", display_name="Unknown")
        assert isinstance(get_adapter(profile), GenericPromptAdapter)
# ---------------------------------------------------------------------------
# Blueprint context tests
# ---------------------------------------------------------------------------
class TestBlueprintContext:
    """get_blueprint_context should surface file structure, libraries, and hints."""

    def test_blueprint_includes_file_structure(self):
        profile = DomainProfile(
            domain_id="test",
            display_name="Test",
            typical_file_structure={"config.py": "Config", "main.py": "Entry"},
            core_libraries=["numpy"],
        )
        ctx = GenericPromptAdapter(profile).get_blueprint_context()
        assert "config.py" in ctx
        assert "numpy" in ctx

    def test_blueprint_includes_hints(self):
        profile = DomainProfile(
            domain_id="test",
            display_name="Test",
            code_generation_hints="Use scipy.integrate for ODE solving",
        )
        ctx = GenericPromptAdapter(profile).get_blueprint_context()
        assert "scipy.integrate" in ctx

    def test_ml_adapter_blueprint_context(self):
        """ML adapter should also provide basic blueprint context."""
        profile = get_profile("ml_vision") or DomainProfile(
            domain_id="ml_vision",
            display_name="CV",
            typical_file_structure={"model.py": "Model", "train.py": "Training"},
        )
        ctx = MLPromptAdapter(profile).get_blueprint_context()
        # ML adapter inherits from base, should have file structure if profile has it
        if profile.typical_file_structure:
            assert "model.py" in ctx or ctx == ""  # acceptable either way
# ---------------------------------------------------------------------------
# Adapter registration tests
# ---------------------------------------------------------------------------
class TestAdapterRegistration:
    """register_adapter lets callers plug in a custom PromptAdapter subclass."""

    def test_register_custom_adapter(self):
        class CustomAdapter(PromptAdapter):
            # Minimal concrete subclass: only code-gen blocks carry content.
            def get_code_generation_blocks(self, ctx):
                return PromptBlocks(code_generation_hints="custom")

            def get_experiment_design_blocks(self, ctx):
                return PromptBlocks()

            def get_result_analysis_blocks(self, ctx):
                return PromptBlocks()

        register_adapter("custom_domain", CustomAdapter)
        dispatched = get_adapter(
            DomainProfile(domain_id="custom_domain", display_name="Custom")
        )
        assert isinstance(dispatched, CustomAdapter)
        assert dispatched.get_code_generation_blocks({}).code_generation_hints == "custom"
================================================
FILE: tests/test_rc_adapters.py
================================================
from __future__ import annotations
from researchclaw.adapters import (
AdapterBundle,
BrowserPage,
FetchResponse,
RecordingBrowserAdapter,
RecordingCronAdapter,
RecordingMemoryAdapter,
RecordingMessageAdapter,
RecordingSessionsAdapter,
RecordingWebFetchAdapter,
)
def test_adapter_bundle_defaults_are_recording_types():
    """A default-constructed bundle wires every slot to its recording stub."""
    bundle = AdapterBundle()
    expected = (
        ("cron", RecordingCronAdapter),
        ("message", RecordingMessageAdapter),
        ("memory", RecordingMemoryAdapter),
        ("sessions", RecordingSessionsAdapter),
        ("web_fetch", RecordingWebFetchAdapter),
        ("browser", RecordingBrowserAdapter),
    )
    for attr, adapter_cls in expected:
        assert isinstance(getattr(bundle, attr), adapter_cls)
def test_recording_cron_adapter_records_call_and_returns_id():
    """schedule_resume returns a synthetic id and logs its arguments."""
    cron = RecordingCronAdapter()
    assert cron.schedule_resume("run-1", 7, "gate opened") == "cron-1"
    assert cron.calls == [("run-1", 7, "gate opened")]
def test_recording_message_adapter_notify_records_call():
    """notify returns a synthetic id and logs its arguments."""
    messenger = RecordingMessageAdapter()
    assert messenger.notify("ops", "stage update", "stage 3 done") == "message-1"
    assert messenger.calls == [("ops", "stage update", "stage 3 done")]
def test_recording_memory_adapter_append_records_entries():
    """append returns a synthetic id and records the (topic, text) entry."""
    memory = RecordingMemoryAdapter()
    assert memory.append("runs", "run-1 started") == "memory-1"
    assert memory.entries == [("runs", "run-1 started")]
def test_recording_sessions_adapter_spawn_records_calls():
    """spawn returns a synthetic session id and records its arguments."""
    sessions = RecordingSessionsAdapter()
    assert sessions.spawn("worker", ("python", "train.py")) == "session-1"
    assert sessions.calls == [("worker", ("python", "train.py"))]
def test_recording_webfetch_fetch_returns_success_response():
    """fetch yields a FetchResponse echoing the URL with a 200 status."""
    fetcher = RecordingWebFetchAdapter()
    response = fetcher.fetch("https://example.com")
    assert isinstance(response, FetchResponse)
    assert response.url == "https://example.com"
    assert response.status_code == 200
    assert "stub fetch" in response.text
def test_recording_browser_open_returns_browser_page():
    """open yields a BrowserPage echoing the URL with a stub title."""
    browser = RecordingBrowserAdapter()
    page = browser.open("https://example.com")
    assert isinstance(page, BrowserPage)
    assert page.url == "https://example.com"
    assert "Stub browser page" in page.title
def test_fetch_response_dataclass_fields():
    """FetchResponse stores url/status_code/text verbatim."""
    response = FetchResponse(url="u", status_code=201, text="ok")
    assert (response.url, response.status_code, response.text) == ("u", 201, "ok")
def test_browser_page_dataclass_fields():
    """BrowserPage stores url/title verbatim."""
    page = BrowserPage(url="https://a", title="A")
    assert (page.url, page.title) == ("https://a", "A")
def test_all_adapters_start_with_empty_call_lists():
    """Freshly constructed adapters have recorded nothing yet."""
    assert RecordingCronAdapter().calls == []
    assert RecordingMessageAdapter().calls == []
    assert RecordingMemoryAdapter().entries == []
    assert RecordingSessionsAdapter().calls == []
    assert RecordingWebFetchAdapter().calls == []
    assert RecordingBrowserAdapter().calls == []
================================================
FILE: tests/test_rc_cache.py
================================================
"""Tests for literature query cache and degradation fallback."""
from __future__ import annotations
import importlib
from unittest.mock import patch
from researchclaw.literature.models import Author, Paper
from researchclaw.literature.search import search_papers
# Bind the cache helpers via importlib rather than a plain `from ... import`.
# NOTE(review): presumably the dynamic import exists to sidestep static import
# analysis or an import-order issue — confirm with the module author.
cache_mod = importlib.import_module("researchclaw.literature.cache")
cache_key = cache_mod.cache_key
cache_stats = cache_mod.cache_stats
clear_cache = cache_mod.clear_cache
get_cached = cache_mod.get_cached
put_cache = cache_mod.put_cache
class TestCacheKey:
    """cache_key must be a stable, case-insensitive, 16-character digest."""

    def test_deterministic(self, tmp_path):
        _ = tmp_path
        assert cache_key("transformer", "s2", 20) == cache_key("transformer", "s2", 20)

    def test_different_query(self):
        assert cache_key("transformer", "s2", 20) != cache_key("attention", "s2", 20)

    def test_case_insensitive(self):
        # Query and source casing must not change the key.
        assert cache_key("Transformer", "S2", 20) == cache_key("transformer", "s2", 20)

    def test_length_16(self):
        assert len(cache_key("test", "s2", 10)) == 16
class TestGetPut:
    """Round-trips, misses, TTL expiry, and corrupted cache entries."""

    def test_put_and_get(self, tmp_path):
        stored = [{"paper_id": "1", "title": "Test Paper"}]
        put_cache("q1", "s2", 20, stored, cache_base=tmp_path)
        fetched = get_cached("q1", "s2", 20, cache_base=tmp_path)
        assert fetched is not None
        assert len(fetched) == 1
        assert fetched[0]["title"] == "Test Paper"

    def test_cache_miss(self, tmp_path):
        assert get_cached("nonexistent", "s2", 20, cache_base=tmp_path) is None

    def test_cache_expired(self, tmp_path):
        put_cache("q1", "s2", 20, [{"paper_id": "1", "title": "Old"}], cache_base=tmp_path)
        # ttl=0 makes any existing entry immediately stale.
        assert get_cached("q1", "s2", 20, cache_base=tmp_path, ttl=0) is None

    def test_cache_not_expired(self, tmp_path):
        put_cache("q1", "s2", 20, [{"paper_id": "1", "title": "Fresh"}], cache_base=tmp_path)
        assert get_cached("q1", "s2", 20, cache_base=tmp_path, ttl=9999) is not None

    def test_corrupted_cache_returns_none(self, tmp_path):
        # An unreadable entry is treated as a miss, not an error.
        entry_key = cache_key("q1", "s2", 20)
        (tmp_path / f"{entry_key}.json").write_text("not json", encoding="utf-8")
        assert get_cached("q1", "s2", 20, cache_base=tmp_path) is None
class TestClear:
    """clear_cache removes every entry and reports how many were deleted."""

    def test_clear_removes_all(self, tmp_path):
        put_cache("q1", "s2", 20, [{"id": "1"}], cache_base=tmp_path)
        put_cache("q2", "arxiv", 10, [{"id": "2"}], cache_base=tmp_path)
        assert clear_cache(cache_base=tmp_path) == 2
        assert get_cached("q1", "s2", 20, cache_base=tmp_path) is None

    def test_clear_empty(self, tmp_path):
        assert clear_cache(cache_base=tmp_path) == 0
class TestStats:
    """cache_stats reports the entry count and total size on disk."""

    def test_stats_empty(self, tmp_path):
        stats = cache_stats(cache_base=tmp_path)
        assert stats["entries"] == 0
        assert stats["total_bytes"] == 0

    def test_stats_with_entries(self, tmp_path):
        put_cache("q1", "s2", 20, [{"id": "1"}], cache_base=tmp_path)
        stats = cache_stats(cache_base=tmp_path)
        assert stats["entries"] == 1
        assert stats["total_bytes"] > 0
class TestSearchDegradation:
    """search_papers must fall back to the on-disk cache when every backend
    fails, and must write successful results back into the cache.

    Idiom fix: the previous five-deep nested ``with patch(...)`` pyramids are
    flattened into single comma-joined ``with`` statements — identical patch
    semantics, far less indentation.
    """

    def test_search_uses_cache_on_failure(self, tmp_path):
        # Pre-populate the cache with a single Semantic Scholar record.
        cached_papers = [
            {
                "paper_id": "s2-123",
                "title": "Cached Paper",
                "authors": [],
                "year": 2024,
                "abstract": "",
                "venue": "",
                "citation_count": 10,
                "doi": "",
                "arxiv_id": "",
                "url": "",
                "source": "semantic_scholar",
            }
        ]
        put_cache(
            "test query",
            "semantic_scholar",
            20,
            cached_papers,
            cache_base=tmp_path,
        )
        # All three live backends raise, so the cached entry is the only
        # possible source of results.
        with patch(
            "researchclaw.literature.search.search_openalex",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.search.search_semantic_scholar",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.search.search_arxiv",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.cache._DEFAULT_CACHE_DIR", tmp_path
        ), patch(
            "researchclaw.literature.search.time.sleep", lambda _: None
        ):
            results = search_papers("test query", limit=20)
        assert len(results) >= 1
        assert results[0].title == "Cached Paper"

    def test_search_caches_successful_results(self, tmp_path):
        mock_paper = Paper(
            paper_id="s2-test",
            title="Test",
            authors=(Author(name="Smith"),),
            year=2024,
            abstract="abs",
            source="semantic_scholar",
        )
        with patch(
            "researchclaw.literature.search.search_semantic_scholar",
            return_value=[mock_paper],
        ), patch(
            "researchclaw.literature.search.search_arxiv", return_value=[]
        ), patch(
            "researchclaw.literature.cache._DEFAULT_CACHE_DIR", tmp_path
        ), patch(
            "researchclaw.literature.search.time.sleep", lambda _: None
        ):
            _ = search_papers("test", limit=20)
        # The successful semantic-scholar results must now be on disk.
        cached = get_cached("test", "semantic_scholar", 20, cache_base=tmp_path)
        assert cached is not None
        assert cached[0]["paper_id"] == "s2-test"
================================================
FILE: tests/test_rc_checkpoint.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false
"""Tests for checkpoint/resume and content metrics."""
from __future__ import annotations
import json
from pathlib import Path
from typing import cast
from researchclaw.pipeline.executor import StageResult
from researchclaw.pipeline.runner import (
_build_pipeline_summary,
_collect_content_metrics,
_write_checkpoint,
read_checkpoint,
resume_from_checkpoint,
)
from researchclaw.pipeline.stages import (
NONCRITICAL_STAGES,
STAGE_SEQUENCE,
Stage,
StageStatus,
)
class TestCheckpoint:
    """Writing, reading, and resuming from checkpoint.json."""

    def test_write_checkpoint(self, tmp_path: Path):
        _write_checkpoint(tmp_path, Stage.LITERATURE_COLLECT, "test-run")
        payload = json.loads((tmp_path / "checkpoint.json").read_text())
        assert payload["last_completed_stage"] == 4
        assert payload["last_completed_name"] == "LITERATURE_COLLECT"
        assert payload["run_id"] == "test-run"
        assert "timestamp" in payload

    def test_read_checkpoint_returns_next_stage(self, tmp_path: Path):
        _write_checkpoint(tmp_path, Stage.LITERATURE_COLLECT, "test-run")
        assert read_checkpoint(tmp_path) == Stage.LITERATURE_SCREEN

    def test_read_checkpoint_no_file(self, tmp_path: Path):
        assert read_checkpoint(tmp_path) is None

    def test_read_checkpoint_last_stage(self, tmp_path: Path):
        # No stage follows CITATION_VERIFY, so there is nothing to resume.
        _write_checkpoint(tmp_path, Stage.CITATION_VERIFY, "test-run")
        assert read_checkpoint(tmp_path) is None

    def test_read_checkpoint_corrupted(self, tmp_path: Path):
        (tmp_path / "checkpoint.json").write_text("not json", encoding="utf-8")
        assert read_checkpoint(tmp_path) is None

    def test_read_checkpoint_invalid_stage(self, tmp_path: Path):
        (tmp_path / "checkpoint.json").write_text(
            json.dumps({"last_completed_stage": 999}), encoding="utf-8"
        )
        assert read_checkpoint(tmp_path) is None

    def test_resume_from_checkpoint_uses_default(self, tmp_path: Path):
        assert resume_from_checkpoint(tmp_path) == Stage.TOPIC_INIT

    def test_resume_from_checkpoint_uses_next_stage(self, tmp_path: Path):
        _write_checkpoint(tmp_path, Stage.SEARCH_STRATEGY, "run-x")
        assert resume_from_checkpoint(tmp_path) == Stage.LITERATURE_COLLECT
class TestNoncriticalStages:
    """Criticality classification of pipeline stages."""

    def test_knowledge_archive_is_noncritical(self):
        assert Stage.KNOWLEDGE_ARCHIVE in NONCRITICAL_STAGES

    def test_citation_verify_is_critical(self):
        # T3.4: CITATION_VERIFY is now critical — hallucinated refs must block export
        assert Stage.CITATION_VERIFY not in NONCRITICAL_STAGES

    def test_topic_init_is_critical(self):
        assert Stage.TOPIC_INIT not in NONCRITICAL_STAGES

    def test_paper_draft_is_critical(self):
        assert Stage.PAPER_DRAFT not in NONCRITICAL_STAGES

    def test_stage_sequence_still_ends_with_citation_verify(self):
        # Pipeline ordering invariant the resume logic relies on.
        assert STAGE_SEQUENCE[-1] == Stage.CITATION_VERIFY
class TestContentMetrics:
    """_collect_content_metrics over various run-directory layouts."""

    def _write_report(self, run_dir: Path, payload: dict) -> None:
        """Drop a stage-23 verification_report.json with ``payload`` into ``run_dir``."""
        stage_dir = run_dir / "stage-23"
        stage_dir.mkdir()
        (stage_dir / "verification_report.json").write_text(
            json.dumps(payload), encoding="utf-8"
        )

    def test_metrics_empty_run_dir(self, tmp_path: Path):
        metrics = _collect_content_metrics(tmp_path)
        assert metrics["template_ratio"] is None
        assert metrics["citation_verify_score"] is None
        assert metrics["total_citations"] is None
        assert metrics["degraded_sources"] == []

    def test_metrics_with_draft(self, tmp_path: Path):
        draft_dir = tmp_path / "stage-17"
        draft_dir.mkdir()
        (draft_dir / "paper_draft.md").write_text(
            "This is a real academic paper about transformers and attention mechanisms. We propose a novel method for improving efficiency.",
            encoding="utf-8",
        )
        metrics = _collect_content_metrics(tmp_path)
        assert metrics["template_ratio"] is not None
        assert cast(float, metrics["template_ratio"]) < 0.5

    def test_metrics_with_verification(self, tmp_path: Path):
        self._write_report(
            tmp_path,
            {
                "summary": {
                    "total": 10,
                    "verified": 8,
                    "suspicious": 1,
                    "hallucinated": 1,
                    "skipped": 0,
                    "integrity_score": 0.8,
                },
                "results": [],
            },
        )
        metrics = _collect_content_metrics(tmp_path)
        assert metrics["total_citations"] == 10
        assert metrics["verified_citations"] == 8
        assert metrics["citation_verify_score"] == 0.8

    def test_metrics_no_stage23(self, tmp_path: Path):
        assert _collect_content_metrics(tmp_path)["citation_verify_score"] is None

    def test_metrics_with_non_dict_summary(self, tmp_path: Path):
        """Must not raise NameError when 'summary' is not a dict."""
        self._write_report(tmp_path, {"summary": "unexpected string"})
        metrics = _collect_content_metrics(tmp_path)
        assert metrics["total_citations"] is None
        assert metrics["verified_citations"] is None
        assert metrics["citation_verify_score"] is None

    def test_metrics_with_summary_missing_fields(self, tmp_path: Path):
        """summary dict without total/verified should not crash."""
        self._write_report(tmp_path, {"summary": {"notes": "incomplete"}})
        metrics = _collect_content_metrics(tmp_path)
        assert metrics["total_citations"] == 0
        assert metrics["verified_citations"] == 0
        assert metrics["citation_verify_score"] is None

    def test_summary_includes_content_metrics(self, tmp_path: Path):
        stage_results = [
            StageResult(
                stage=Stage.TOPIC_INIT,
                status=StageStatus.DONE,
                artifacts=("topic.json",),
            ),
        ]
        summary = _build_pipeline_summary(
            run_id="test",
            results=stage_results,
            from_stage=Stage.TOPIC_INIT,
            run_dir=tmp_path,
        )
        assert "content_metrics" in summary
        assert isinstance(summary["content_metrics"], dict)
================================================
FILE: tests/test_rc_citation_resolve.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false
"""Tests for BUG-194: Citation resolver must not replace correct bib entries
with garbage papers from search results.
Tests cover:
- _resolve_missing_citations: seminal lookup, API validation, rejection of
unrelated results, year mismatch rejection
- _load_seminal_papers_by_key: index construction
- _seminal_to_bibtex: BibTeX generation from YAML entries
"""
from __future__ import annotations
from unittest.mock import patch
import pytest
from researchclaw.literature.models import Author, Paper
# ---------------------------------------------------------------------------
# Helpers to build mock Paper objects
# ---------------------------------------------------------------------------
def _make_paper(
    title: str,
    year: int = 2020,
    authors: list[str] | None = None,
    bibtex_override: str = "",
) -> Paper:
    """Build a minimal Paper for tests.

    The paper_id is derived from the first ten characters of the title
    (lower-cased, spaces replaced with underscores).
    """
    author_names = authors if authors is not None else ["Unknown"]
    slug = title[:10].replace(' ', '_').lower()
    return Paper(
        paper_id=f"test_{slug}",
        title=title,
        authors=tuple(Author(name=name) for name in author_names),
        year=year,
        source="test",
        _bibtex_override=bibtex_override,
    )
# Patch target for search_papers — _resolve_missing_citations performs a lazy
# `from researchclaw.literature.search import search_papers` at call time, so
# patches must target the *source* module, not the importing module.
_SEARCH_PAPERS_PATH = "researchclaw.literature.search.search_papers"
# ---------------------------------------------------------------------------
# Tests for _load_seminal_papers_by_key
# ---------------------------------------------------------------------------
class TestLoadSeminalPapersByKey:
    """Test the seminal papers index builder."""

    def test_loads_well_known_keys(self):
        from researchclaw.pipeline.stage_impls._review_publish import (
            _load_seminal_papers_by_key,
        )

        index = _load_seminal_papers_by_key()
        # seminal_papers.yaml must ship these foundational papers.
        for key in ("he2016deep", "vaswani2017attention", "srivastava2014dropout"):
            assert key in index

    def test_entries_have_required_fields(self):
        from researchclaw.pipeline.stage_impls._review_publish import (
            _load_seminal_papers_by_key,
        )

        for key, entry in _load_seminal_papers_by_key().items():
            assert "title" in entry, f"Missing title for {key}"
            assert "year" in entry, f"Missing year for {key}"
            assert "authors" in entry, f"Missing authors for {key}"

    def test_graceful_on_load_failure(self):
        """If _load_all raises, _load_seminal_papers_by_key returns {}."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _load_seminal_papers_by_key,
        )

        broken_loader = patch(
            "researchclaw.data._load_all",
            side_effect=RuntimeError("disk error"),
        )
        with broken_loader:
            assert _load_seminal_papers_by_key() == {}
# ---------------------------------------------------------------------------
# Tests for _seminal_to_bibtex
# ---------------------------------------------------------------------------
class TestSeminalToBibtex:
    """Test BibTeX generation from seminal_papers.yaml entries."""

    @staticmethod
    def _render(entry: dict, key: str) -> str:
        # Local import keeps collection-time behavior identical to the
        # per-test imports used elsewhere in this file.
        from researchclaw.pipeline.stage_impls._review_publish import _seminal_to_bibtex

        return _seminal_to_bibtex(entry, key)

    def test_conference_paper(self):
        bib = self._render(
            {
                "title": "Deep Residual Learning for Image Recognition",
                "authors": "He et al.",
                "year": 2016,
                "venue": "CVPR",
            },
            "he2016deep",
        )
        assert "@inproceedings{he2016deep," in bib
        assert "Deep Residual Learning" in bib
        assert "He et al." in bib
        assert "2016" in bib
        assert "booktitle = {CVPR}" in bib

    def test_journal_paper(self):
        bib = self._render(
            {
                "title": "Dropout: A Simple Way to Prevent Neural Networks from Overfitting",
                "authors": "Srivastava et al.",
                "year": 2014,
                "venue": "JMLR",
            },
            "srivastava2014dropout",
        )
        assert "@article{srivastava2014dropout," in bib
        assert "Dropout" in bib
        assert "journal = {JMLR}" in bib

    def test_neurips_is_conference(self):
        bib = self._render(
            {
                "title": "Attention Is All You Need",
                "authors": "Vaswani et al.",
                "year": 2017,
                "venue": "NeurIPS",
            },
            "vaswani2017attention",
        )
        assert "@inproceedings{vaswani2017attention," in bib
# ---------------------------------------------------------------------------
# Tests for _resolve_missing_citations
# ---------------------------------------------------------------------------
class TestResolveMissingCitations:
    """Test the full resolution pipeline with BUG-194 fixes.

    Fix in this revision: test_completely_unrelated_title_rejected previously
    contained no assertions at all (it could never fail); it now pins the
    invariants it can, and test_picks_best_result_from_multiple gained an
    unconditional bookkeeping assertion.
    """

    def test_seminal_papers_resolved_without_api(self):
        """Foundational papers should be resolved from seminal_papers.yaml
        without any API calls."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        missing = {"he2016deep", "vaswani2017attention", "srivastava2014dropout"}
        existing_bib = ""
        # Patch search_papers so it FAILS if called — seminal papers shouldn't
        # need it.
        with patch(
            _SEARCH_PAPERS_PATH,
            side_effect=AssertionError("Should not be called for seminal papers"),
        ):
            resolved, entries = _resolve_missing_citations(missing, existing_bib)
        assert "he2016deep" in resolved
        assert "vaswani2017attention" in resolved
        assert "srivastava2014dropout" in resolved
        assert len(entries) == 3
        # Verify the BibTeX entries contain correct titles
        combined = "\n".join(entries)
        assert "Deep Residual Learning" in combined
        assert "Attention Is All You Need" in combined
        assert "Dropout" in combined

    def test_seminal_papers_not_duplicated_in_existing_bib(self):
        """If the key is already in existing_bib, don't add it again."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        existing_bib = "@article{he2016deep, title={Deep Residual Learning}}"
        missing = {"he2016deep"}
        # Mock search_papers to ensure no real API calls (key should be skipped
        # entirely since it's already in existing_bib).
        with patch(
            _SEARCH_PAPERS_PATH,
            side_effect=AssertionError("Should not call API for key in existing_bib"),
        ):
            resolved, entries = _resolve_missing_citations(missing, existing_bib)
        assert "he2016deep" not in resolved
        assert len(entries) == 0

    def test_garbage_results_rejected_by_similarity(self):
        """BUG-194 regression: unrelated search results must be rejected."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        # Mock a garbage result that has the right year but wrong title
        garbage_paper = _make_paper(
            title="Jokowi and the New Developmentalism",
            year=2016,
            authors=["He, Some Politician"],
            bibtex_override=(
                "@article{jokowi2016,\n"
                " title = {Jokowi and the New Developmentalism},\n"
                " author = {He, Some Politician},\n"
                " year = {2016},\n"
                "}"
            ),
        )
        # This key is NOT in seminal_papers.yaml
        missing = {"smith2016novel"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[garbage_paper]):
            resolved, entries = _resolve_missing_citations(missing, "")
        # The garbage result should be rejected (no overlap with "smith novel")
        assert "smith2016novel" not in resolved
        assert len(entries) == 0

    def test_year_mismatch_rejected(self):
        """Results with year > 1 year off from cite key are rejected."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        wrong_year_paper = _make_paper(
            title="Novel Deep Learning Approach by Smith",
            year=2020,  # cite key says 2016
            authors=["Smith, John"],
            bibtex_override=(
                "@article{smith2020,\n"
                " title = {Novel Deep Learning Approach by Smith},\n"
                " author = {Smith, John},\n"
                " year = {2020},\n"
                "}"
            ),
        )
        missing = {"smith2016novel"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[wrong_year_paper]):
            resolved, entries = _resolve_missing_citations(missing, "")
        assert "smith2016novel" not in resolved

    def test_good_api_result_accepted(self):
        """A search result with matching author + title words should be accepted."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        good_paper = _make_paper(
            title="Novel Approach to Feature Extraction in Deep Networks",
            year=2018,
            authors=["Chen, Wei"],
            bibtex_override=(
                "@article{chen2018something,\n"
                " title = {Novel Approach to Feature Extraction in Deep Networks},\n"
                " author = {Chen, Wei},\n"
                " year = {2018},\n"
                "}"
            ),
        )
        # cite key: chen2018novel — "chen" matches author, "novel" matches title
        missing = {"chen2018novel"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[good_paper]):
            resolved, entries = _resolve_missing_citations(missing, "")
        assert "chen2018novel" in resolved
        assert len(entries) == 1
        # The bib entry should use the original cite_key
        assert "chen2018novel" in entries[0]

    def test_empty_missing_keys_returns_empty(self):
        """No keys to resolve -> empty results."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        resolved, entries = _resolve_missing_citations(set(), "")
        assert len(resolved) == 0
        assert len(entries) == 0

    def test_unparseable_keys_skipped(self):
        """Keys that don't match author-year pattern are skipped."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        missing = {"notyearkey", "abc"}
        resolved, entries = _resolve_missing_citations(missing, "")
        assert len(resolved) == 0
        assert len(entries) == 0

    def test_import_failure_returns_seminal_only(self):
        """If search_papers can't be imported, seminal results still returned."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        # Mix of seminal and non-seminal keys
        missing = {"he2016deep", "unknownauthor2020something"}
        with patch(
            _SEARCH_PAPERS_PATH,
            side_effect=ImportError("mocked"),
        ):
            resolved, entries = _resolve_missing_citations(missing, "")
        # he2016deep should be resolved from seminal
        assert "he2016deep" in resolved
        # unknownauthor2020something would need API which fails
        assert "unknownauthor2020something" not in resolved

    def test_search_exception_handled_gracefully(self):
        """If search_papers raises, the key is skipped (no crash)."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        missing = {"unknownauthor2020something"}
        with patch(
            _SEARCH_PAPERS_PATH,
            side_effect=RuntimeError("API down"),
        ):
            resolved, entries = _resolve_missing_citations(missing, "")
        assert len(resolved) == 0

    def test_bug194_he2016deep_not_replaced_with_jokowi(self):
        """BUG-194 exact regression: he2016deep must NEVER resolve to
        'Jokowi and the New Developmentalism'."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        # he2016deep IS in seminal_papers.yaml, so it should resolve from there
        missing = {"he2016deep"}
        resolved, entries = _resolve_missing_citations(missing, "")
        assert "he2016deep" in resolved
        assert len(entries) == 1
        assert "Jokowi" not in entries[0]
        assert "Deep Residual Learning" in entries[0]

    def test_bug194_vaswani2017attention_not_replaced_with_health_supplement(self):
        """BUG-194 exact regression: vaswani2017attention must resolve to
        'Attention Is All You Need', not health supplement garbage."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        missing = {"vaswani2017attention"}
        resolved, entries = _resolve_missing_citations(missing, "")
        assert "vaswani2017attention" in resolved
        assert len(entries) == 1
        assert "Health Supplement" not in entries[0]
        assert "Attention Is All You Need" in entries[0]

    def test_bug194_srivastava2014dropout_not_replaced_with_cnn_sentence(self):
        """BUG-194 exact regression: srivastava2014dropout must resolve to
        Dropout paper, not CNN for Sentence Classification."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        missing = {"srivastava2014dropout"}
        resolved, entries = _resolve_missing_citations(missing, "")
        assert "srivastava2014dropout" in resolved
        assert len(entries) == 1
        assert "Sentence Classification" not in entries[0]
        assert "Dropout" in entries[0]

    def test_multiple_seminal_and_api_mixed(self):
        """Mix of seminal keys (resolved locally) and API keys."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        api_paper = _make_paper(
            title="Adaptive Learning Rate Methods for Deep Networks",
            year=2019,
            authors=["Zhang, Adaptive"],
            bibtex_override=(
                "@article{zhang2019something,\n"
                " title = {Adaptive Learning Rate Methods for Deep Networks},\n"
                " author = {Zhang, Adaptive},\n"
                " year = {2019},\n"
                "}"
            ),
        )
        missing = {"he2016deep", "zhang2019adaptive"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[api_paper]):
            resolved, entries = _resolve_missing_citations(missing, "")
        # he2016deep from seminal, zhang2019adaptive from API
        assert "he2016deep" in resolved
        assert "zhang2019adaptive" in resolved
        assert len(entries) == 2

    def test_no_results_from_api_skips(self):
        """If API returns empty list, key is skipped (not crashed)."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        missing = {"unknownauthor2020something"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[]):
            resolved, entries = _resolve_missing_citations(missing, "")
        assert len(resolved) == 0
        assert len(entries) == 0

    def test_close_year_accepted(self):
        """A result with year within 1 of the cite key year should be accepted
        (arXiv vs conference year difference)."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        paper = _make_paper(
            title="Novel Deep Feature Extraction by Li",
            year=2019,  # cite key says 2018, but 1 year off is OK
            authors=["Li, Novel"],
            bibtex_override=(
                "@article{li2019,\n"
                " title = {Novel Deep Feature Extraction by Li},\n"
                " author = {Li, Novel},\n"
                " year = {2019},\n"
                "}"
            ),
        )
        missing = {"li2018novel"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[paper]):
            resolved, entries = _resolve_missing_citations(missing, "")
        # Year 2019 vs 2018 — diff=1, should be accepted since title matches
        assert "li2018novel" in resolved

    def test_completely_unrelated_title_rejected(self):
        """The validation path must run for non-seminal keys.

        FIX: this test previously contained no assertions at all, so it could
        never fail. "vaswani" matches the author and "health" matches the
        title, so the candidate may legitimately pass similarity validation;
        what we can pin down is (1) non-seminal keys MUST hit the search API,
        and (2) resolution bookkeeping stays consistent either way.
        """
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        paper = _make_paper(
            title="AI-Assisted Pipeline for Dynamic Generation of Trustworthy Health Supplement Content at Scale",
            year=2017,
            authors=["Vaswani, Raj"],
            bibtex_override=(
                "@article{vaswani2017health,\n"
                " title = {AI-Assisted Pipeline for Dynamic Generation of Trustworthy Health Supplement Content at Scale},\n"
                " author = {Vaswani, Raj},\n"
                " year = {2017},\n"
                "}"
            ),
        )
        # Not in seminal_papers.yaml (different key)
        missing = {"vaswani2017health"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[paper]) as mock_search:
            resolved, entries = _resolve_missing_citations(missing, "")
        # The search is called only for non-seminal keys — so it must have
        # been consulted for this key.
        assert mock_search.called
        # One bib entry per resolved key, whether accepted or rejected.
        assert len(entries) == len(resolved)

    def test_picks_best_result_from_multiple(self):
        """When API returns multiple results, the one with best overlap wins."""
        from researchclaw.pipeline.stage_impls._review_publish import (
            _resolve_missing_citations,
        )

        bad_paper = _make_paper(
            title="Convolutional Neural Networks for Sentence Classification",
            year=2018,
            authors=["Kim, Yoon"],
        )
        good_paper = _make_paper(
            title="Feature Extraction via Progressive Learning",
            year=2018,
            authors=["Wang, Feature"],
            bibtex_override=(
                "@article{wang2018,\n"
                " title = {Feature Extraction via Progressive Learning},\n"
                " author = {Wang, Feature},\n"
                " year = {2018},\n"
                "}"
            ),
        )
        missing = {"wang2018feature"}
        with patch(_SEARCH_PAPERS_PATH, return_value=[bad_paper, good_paper]):
            resolved, entries = _resolve_missing_citations(missing, "")
        # Bookkeeping invariant holds regardless of accept/reject.
        assert len(entries) == len(resolved)
        if resolved:
            # If resolved, it should be the good paper, not the bad one
            assert "Sentence Classification" not in entries[0]
================================================
FILE: tests/test_rc_citation_verify.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false
from __future__ import annotations
import json
import textwrap
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.literature.verify import (
CitationResult,
VerificationReport,
VerifyStatus,
annotate_paper_hallucinations,
filter_verified_bibtex,
parse_bibtex_entries,
title_similarity,
verify_by_arxiv_id,
verify_by_doi,
verify_by_title_search,
verify_citations,
)
from researchclaw.literature.models import Author, Paper
# Three-entry fixture: an arXiv-id entry, a DOI entry, and one fabricated
# entry that exists nowhere (the hallucination case).
SAMPLE_BIB = textwrap.dedent("""\
@article{vaswani2017attention,
title = {Attention Is All You Need},
author = {Ashish Vaswani and Noam Shazeer},
year = {2017},
eprint = {1706.03762},
archiveprefix = {arXiv},
}
@inproceedings{devlin2019bert,
title = {BERT: Pre-training of Deep Bidirectional Transformers},
author = {Jacob Devlin},
year = {2019},
doi = {10.18653/v1/N19-1423},
booktitle = {NAACL},
}
@article{fakepaper2025hallucinated,
title = {A Completely Made Up Paper That Does Not Exist},
author = {Imaginary Author},
year = {2025},
}
""")
# Canned arXiv API payload for a real paper (1706.03762).
SAMPLE_ARXIV_VERIFY_RESPONSE = textwrap.dedent("""\
http://arxiv.org/abs/1706.03762v5
Attention Is All You Need
The dominant sequence transduction models...
Ashish Vaswani
""")
# Canned arXiv API payload for an invalid identifier (the error document
# returned for a malformed id).
SAMPLE_ARXIV_EMPTY_RESPONSE = textwrap.dedent("""\
http://arxiv.org/api/errors#incorrect_id_format_for_9999.99999
Error
incorrect id format for 9999.99999
""")
# Canned Crossref /works response matching the devlin2019bert DOI above.
SAMPLE_CROSSREF_RESPONSE = {
    "status": "ok",
    "message": {
        "DOI": "10.18653/v1/N19-1423",
        "title": [
            "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
        ],
        "author": [{"given": "Jacob", "family": "Devlin"}],
    },
}
class TestParseBibtexEntries:
    """parse_bibtex_entries turns raw BibTeX text into field dicts."""

    @staticmethod
    def _entries():
        return parse_bibtex_entries(SAMPLE_BIB)

    def test_parses_three_entries(self) -> None:
        assert len(self._entries()) == 3

    def test_entry_keys(self) -> None:
        keys = [entry["key"] for entry in self._entries()]
        assert "vaswani2017attention" in keys
        assert "devlin2019bert" in keys
        assert "fakepaper2025hallucinated" in keys

    def test_entry_fields(self) -> None:
        by_key = {entry["key"]: entry for entry in self._entries()}
        vaswani = by_key["vaswani2017attention"]
        assert vaswani["title"] == "Attention Is All You Need"
        assert vaswani["eprint"] == "1706.03762"
        assert vaswani["type"] == "article"

    def test_entry_type(self) -> None:
        by_key = {entry["key"]: entry for entry in self._entries()}
        devlin = by_key["devlin2019bert"]
        assert devlin["type"] == "inproceedings"
        assert devlin["doi"] == "10.18653/v1/N19-1423"

    def test_empty_bib(self) -> None:
        assert parse_bibtex_entries("") == []

    def test_malformed_bib(self) -> None:
        assert parse_bibtex_entries("not bibtex at all") == []
class TestTitleSimilarity:
    """Behaviour of the fuzzy title_similarity metric."""

    def test_identical(self) -> None:
        title = "Attention Is All You Need"
        assert title_similarity(title, title) == 1.0

    def test_case_insensitive(self) -> None:
        score = title_similarity(
            "attention is all you need", "ATTENTION IS ALL YOU NEED"
        )
        assert score == 1.0

    def test_high_similarity(self) -> None:
        score = title_similarity(
            "Attention Is All You Need",
            "Attention Is All You Need: A Transformer Architecture",
        )
        assert score >= 0.5

    def test_low_similarity(self) -> None:
        score = title_similarity(
            "Attention Is All You Need",
            "Protein Folding with AlphaFold",
        )
        assert score < 0.3

    def test_empty_strings(self) -> None:
        # Empty input on either side yields zero similarity.
        assert title_similarity("", "") == 0.0
        assert title_similarity("something", "") == 0.0
class TestVerifyByArxivId:
    """verify_by_arxiv_id against canned arXiv API responses."""

    @staticmethod
    def _canned_response(payload: str) -> MagicMock:
        # Context-manager mock standing in for urllib's response object.
        resp = MagicMock()
        resp.read.return_value = payload.encode("utf-8")
        resp.__enter__ = lambda s: s
        resp.__exit__ = MagicMock(return_value=False)
        return resp

    def test_verified_match(self) -> None:
        resp = self._canned_response(SAMPLE_ARXIV_VERIFY_RESPONSE)
        with patch("urllib.request.urlopen", return_value=resp):
            result = verify_by_arxiv_id("1706.03762", "Attention Is All You Need")
        assert result is not None
        assert result.status == VerifyStatus.VERIFIED
        assert result.method == "arxiv_id"
        assert result.confidence >= 0.80

    def test_hallucinated_error_response(self) -> None:
        resp = self._canned_response(SAMPLE_ARXIV_EMPTY_RESPONSE)
        with patch("urllib.request.urlopen", return_value=resp):
            result = verify_by_arxiv_id("9999.99999", "Fake Paper")
        assert result is not None
        assert result.status == VerifyStatus.HALLUCINATED

    def test_network_failure_returns_none(self) -> None:
        with patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
            result = verify_by_arxiv_id("1706.03762", "Attention Is All You Need")
        assert result is None

    def test_title_mismatch_suspicious(self) -> None:
        payload = textwrap.dedent("""\
            http://arxiv.org/abs/1706.03762v5
            A Completely Different Paper Title About Quantum Computing
            Summary
            """)
        resp = self._canned_response(payload)
        with patch("urllib.request.urlopen", return_value=resp):
            result = verify_by_arxiv_id("1706.03762", "Attention Is All You Need")
        assert result is not None
        assert result.status == VerifyStatus.SUSPICIOUS
class TestVerifyByDoi:
    """verify_by_doi against canned Crossref responses."""

    @staticmethod
    def _json_response(payload: dict) -> MagicMock:
        # Context-manager mock whose read() yields the JSON-encoded payload.
        resp = MagicMock()
        resp.read.return_value = json.dumps(payload).encode("utf-8")
        resp.__enter__ = lambda s: s
        resp.__exit__ = MagicMock(return_value=False)
        return resp

    def test_verified_crossref(self) -> None:
        resp = self._json_response(SAMPLE_CROSSREF_RESPONSE)
        with patch("urllib.request.urlopen", return_value=resp):
            result = verify_by_doi(
                "10.18653/v1/N19-1423",
                "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
            )
        assert result is not None
        assert result.status == VerifyStatus.VERIFIED
        assert result.method == "doi"

    def test_doi_404_hallucinated(self) -> None:
        import urllib.error

        http_404 = urllib.error.HTTPError(
            "https://api.crossref.org/works/10.fake/doi",
            404,
            "Not Found",
            {},
            None,  # type: ignore[arg-type]
        )
        with patch("urllib.request.urlopen", side_effect=http_404):
            result = verify_by_doi("10.fake/doi", "Nonexistent Paper")
        assert result is not None
        assert result.status == VerifyStatus.HALLUCINATED

    def test_network_error_returns_none(self) -> None:
        with patch("urllib.request.urlopen", side_effect=OSError("timeout")):
            result = verify_by_doi("10.1234/test", "Test Paper")
        assert result is None

    def test_doi_exists_no_title(self) -> None:
        resp = self._json_response({"status": "ok", "message": {"DOI": "10.1234/test"}})
        with patch("urllib.request.urlopen", return_value=resp):
            result = verify_by_doi("10.1234/test", "Some Paper")
        assert result is not None
        assert result.status == VerifyStatus.VERIFIED
        assert "no title comparison" in result.details.lower()
class TestVerifyByTitleSearch:
    """verify_by_title_search with a patched search backend."""

    @staticmethod
    def _patched_search(results):
        # Patch the source module — verify imports search_papers lazily.
        return patch(
            "researchclaw.literature.search.search_papers",
            return_value=results,
        )

    def test_verified_via_search(self) -> None:
        hit = Paper(
            paper_id="s2-abc",
            title="Attention Is All You Need",
            authors=(Author(name="Vaswani"),),
            year=2017,
            source="semantic_scholar",
        )
        with self._patched_search([hit]):
            result = verify_by_title_search("Attention Is All You Need")
        assert result is not None
        assert result.status == VerifyStatus.VERIFIED
        assert result.matched_paper is not None

    def test_no_results_hallucinated(self) -> None:
        with self._patched_search([]):
            result = verify_by_title_search("A Completely Made Up Paper")
        assert result is not None
        assert result.status == VerifyStatus.HALLUCINATED

    def test_weak_match_hallucinated(self) -> None:
        hit = Paper(
            paper_id="s2-xyz",
            title="Quantum Computing for Protein Folding",
            year=2023,
            source="arxiv",
        )
        with self._patched_search([hit]):
            result = verify_by_title_search("A Completely Made Up Paper About Nothing")
        assert result is not None
        assert result.status == VerifyStatus.HALLUCINATED

    def test_partial_match_suspicious(self) -> None:
        hit = Paper(
            paper_id="s2-partial",
            title="Attention Mechanisms in Neural Networks",
            year=2019,
            source="semantic_scholar",
        )
        with self._patched_search([hit]):
            result = verify_by_title_search("Attention Neural Networks Survey Overview")
        assert result is not None
        assert result.status in (VerifyStatus.SUSPICIOUS, VerifyStatus.HALLUCINATED)

    def test_network_failure_returns_none(self) -> None:
        with patch(
            "researchclaw.literature.search.search_papers",
            side_effect=OSError("network down"),
        ):
            result = verify_by_title_search("Any Paper")
        assert result is None
class TestVerifyCitations:
    """End-to-end verify_citations runs over SAMPLE_BIB with all I/O mocked."""

    def test_full_pipeline_mocked(self) -> None:
        # Canned arXiv response — matches the vaswani entry's eprint.
        arxiv_resp = MagicMock()
        arxiv_resp.read.return_value = SAMPLE_ARXIV_VERIFY_RESPONSE.encode("utf-8")
        arxiv_resp.__enter__ = lambda s: s
        arxiv_resp.__exit__ = MagicMock(return_value=False)
        # Canned Crossref response — matches the devlin entry's DOI.
        crossref_resp = MagicMock()
        crossref_resp.read.return_value = json.dumps(SAMPLE_CROSSREF_RESPONSE).encode(
            "utf-8"
        )
        crossref_resp.__enter__ = lambda s: s
        crossref_resp.__exit__ = MagicMock(return_value=False)
        call_count = {"n": 0}

        def mock_urlopen(req: Any, **kwargs: Any) -> MagicMock:
            # Route by URL host: arXiv vs Crossref; anything else is a bug.
            call_count["n"] += 1
            url = req.full_url if hasattr(req, "full_url") else str(req)
            if "arxiv.org" in url:
                return arxiv_resp
            if "crossref.org" in url:
                return crossref_resp
            raise OSError("unexpected URL")

        # sleep is patched out so per-citation rate limiting costs nothing;
        # search_papers returns [] so the fake entry falls through to
        # "hallucinated".
        with (
            patch("researchclaw.literature.verify.time.sleep"),
            patch("urllib.request.urlopen", side_effect=mock_urlopen),
            patch("researchclaw.literature.search.search_papers", return_value=[]),
        ):
            report = verify_citations(SAMPLE_BIB, inter_verify_delay=0)
        assert report.total == 3
        assert report.verified >= 1
        assert report.hallucinated >= 1
        report_dict = report.to_dict()
        assert "summary" in report_dict
        assert "results" in report_dict
        assert report_dict["summary"]["total"] == 3

    def test_empty_bib(self) -> None:
        # Empty input: nothing to verify, perfect integrity by definition.
        report = verify_citations("")
        assert report.total == 0
        assert report.integrity_score == 1.0

    def test_no_title_entry_skipped(self) -> None:
        # An entry without a title cannot be verified — counted as skipped.
        bib = textwrap.dedent("""\
            @article{noauthor2025,
            author = {Some Author},
            year = {2025},
            }
            """)
        report = verify_citations(bib)
        assert report.total == 1
        assert report.skipped == 1
class TestVerificationReport:
    """Aggregate scoring and serialization of VerificationReport."""

    def test_integrity_score(self) -> None:
        report = VerificationReport(
            total=10, verified=7, suspicious=1, hallucinated=2, skipped=0
        )
        assert report.integrity_score == 0.7

    def test_integrity_score_with_skips(self) -> None:
        # 6 verified out of 8 non-skipped entries -> 0.75.
        report = VerificationReport(
            total=10, verified=6, suspicious=0, hallucinated=2, skipped=2
        )
        assert report.integrity_score == 0.75

    def test_integrity_score_all_skipped(self) -> None:
        everything_skipped = VerificationReport(
            total=3, verified=0, suspicious=0, hallucinated=0, skipped=3
        )
        assert everything_skipped.integrity_score == 1.0

    def test_to_dict(self) -> None:
        serialized = VerificationReport(total=2, verified=1, hallucinated=1).to_dict()
        assert serialized["summary"]["total"] == 2
        assert serialized["summary"]["integrity_score"] == 0.5
class TestFilterVerifiedBibtex:
    """filter_verified_bibtex keeps entries according to verification status."""

    def _make_report(self) -> VerificationReport:
        # (cite_key, title, status, confidence, method) — one row per entry
        # in SAMPLE_BIB, covering all three verification outcomes.
        rows = [
            ("vaswani2017attention", "Attention Is All You Need", VerifyStatus.VERIFIED, 1.0, "arxiv_id"),
            ("devlin2019bert", "BERT", VerifyStatus.SUSPICIOUS, 0.6, "doi"),
            ("fakepaper2025hallucinated", "Fake Paper", VerifyStatus.HALLUCINATED, 0.9, "title_search"),
        ]
        return VerificationReport(
            total=3,
            verified=1,
            suspicious=1,
            hallucinated=1,
            results=[
                CitationResult(
                    cite_key=key,
                    title=title,
                    status=status,
                    confidence=confidence,
                    method=method,
                )
                for key, title, status, confidence, method in rows
            ],
        )

    def test_includes_verified_and_suspicious(self) -> None:
        filtered = filter_verified_bibtex(
            SAMPLE_BIB, self._make_report(), include_suspicious=True
        )
        assert "vaswani2017attention" in filtered
        assert "devlin2019bert" in filtered
        assert "fakepaper2025hallucinated" not in filtered

    def test_excludes_suspicious(self) -> None:
        filtered = filter_verified_bibtex(
            SAMPLE_BIB, self._make_report(), include_suspicious=False
        )
        assert "vaswani2017attention" in filtered
        assert "devlin2019bert" not in filtered
        assert "fakepaper2025hallucinated" not in filtered

    def test_empty_bib(self) -> None:
        assert filter_verified_bibtex("", VerificationReport()) == ""
class TestAnnotatePaperHallucinations:
    """annotate_paper_hallucinations removes hallucinated cites, keeps the rest."""

    @staticmethod
    def _result(cite_key: str, status: VerifyStatus, confidence: float, method: str) -> CitationResult:
        # All annotation tests use empty titles; only key/status matter here.
        return CitationResult(
            cite_key=cite_key,
            title="",
            status=status,
            confidence=confidence,
            method=method,
        )

    def test_latex_citations(self) -> None:
        paper = r"As shown in \cite{vaswani2017attention} and \cite{fakepaper2025hallucinated}."
        report = VerificationReport(
            results=[
                self._result("vaswani2017attention", VerifyStatus.VERIFIED, 1.0, "arxiv_id"),
                self._result("fakepaper2025hallucinated", VerifyStatus.HALLUCINATED, 0.9, "title_search"),
            ],
        )
        annotated = annotate_paper_hallucinations(paper, report)
        assert r"\cite{vaswani2017attention}" in annotated
        # Hallucinated citations are removed, not annotated
        assert "fakepaper2025hallucinated" not in annotated

    def test_markdown_citations(self) -> None:
        paper = "As shown in [vaswani2017attention] and [fakepaper2025hallucinated]."
        report = VerificationReport(
            results=[
                self._result("vaswani2017attention", VerifyStatus.VERIFIED, 1.0, "arxiv_id"),
                self._result("fakepaper2025hallucinated", VerifyStatus.HALLUCINATED, 0.9, "title_search"),
            ],
        )
        annotated = annotate_paper_hallucinations(paper, report)
        assert "[vaswani2017attention]" in annotated
        # Hallucinated citations are removed, not annotated
        assert "fakepaper2025hallucinated" not in annotated

    def test_suspicious_annotation(self) -> None:
        """Suspicious citations are left unchanged (not removed)."""
        paper = r"\cite{devlin2019bert}"
        report = VerificationReport(
            results=[self._result("devlin2019bert", VerifyStatus.SUSPICIOUS, 0.6, "doi")],
        )
        annotated = annotate_paper_hallucinations(paper, report)
        assert r"\cite{devlin2019bert}" in annotated

    def test_no_modifications_all_verified(self) -> None:
        paper = r"See \cite{vaswani2017attention}."
        report = VerificationReport(
            results=[self._result("vaswani2017attention", VerifyStatus.VERIFIED, 1.0, "arxiv_id")],
        )
        assert annotate_paper_hallucinations(paper, report) == paper
class TestCitationResultSerialization:
    """Shape of CitationResult.to_dict output."""

    def test_to_dict_basic(self) -> None:
        serialized = CitationResult(
            cite_key="smith2024test",
            title="Test Paper",
            status=VerifyStatus.VERIFIED,
            confidence=0.95,
            method="arxiv_id",
            details="Confirmed",
        ).to_dict()
        assert serialized["cite_key"] == "smith2024test"
        # Enum status serializes to its lowercase string value.
        assert serialized["status"] == "verified"
        assert serialized["confidence"] == 0.95

    def test_to_dict_with_matched_paper(self) -> None:
        matched = Paper(
            paper_id="s2-abc",
            title="Found Paper",
            authors=(Author(name="Smith"),),
            year=2024,
            source="semantic_scholar",
        )
        serialized = CitationResult(
            cite_key="smith2024test",
            title="Test",
            status=VerifyStatus.VERIFIED,
            confidence=0.9,
            method="title_search",
            matched_paper=matched,
        ).to_dict()
        assert "matched_paper" in serialized
        assert serialized["matched_paper"]["title"] == "Found Paper"
class TestStage23Integration:
    """Stage 23 (CITATION_VERIFY) wiring: enum, sequence, contract, executor, phase map."""

    def test_stage_exists_in_enum(self) -> None:
        from researchclaw.pipeline.stages import Stage

        assert hasattr(Stage, "CITATION_VERIFY")
        assert Stage.CITATION_VERIFY == 23

    def test_stage_in_sequence(self) -> None:
        from researchclaw.pipeline.stages import NEXT_STAGE, STAGE_SEQUENCE, Stage

        assert Stage.CITATION_VERIFY in STAGE_SEQUENCE
        # CITATION_VERIFY is the terminal stage, directly after EXPORT_PUBLISH.
        assert NEXT_STAGE[Stage.EXPORT_PUBLISH] == Stage.CITATION_VERIFY
        assert NEXT_STAGE[Stage.CITATION_VERIFY] is None

    def test_contract_exists(self) -> None:
        from researchclaw.pipeline.contracts import CONTRACTS
        from researchclaw.pipeline.stages import Stage

        assert Stage.CITATION_VERIFY in CONTRACTS
        declared_outputs = CONTRACTS[Stage.CITATION_VERIFY].output_files
        assert "verification_report.json" in declared_outputs
        assert "references_verified.bib" in declared_outputs

    def test_executor_registered(self) -> None:
        from researchclaw.pipeline.executor import _STAGE_EXECUTORS
        from researchclaw.pipeline.stages import Stage

        assert Stage.CITATION_VERIFY in _STAGE_EXECUTORS

    def test_phase_map(self) -> None:
        from researchclaw.pipeline.stages import PHASE_MAP, Stage

        assert Stage.CITATION_VERIFY in PHASE_MAP["H: Finalization"]

    def test_total_stages_is_23(self) -> None:
        from researchclaw.pipeline.stages import STAGE_SEQUENCE

        assert len(STAGE_SEQUENCE) == 23
================================================
FILE: tests/test_rc_cli.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false
from __future__ import annotations
import argparse
import re
from pathlib import Path
import pytest
from researchclaw import cli as rc_cli
from researchclaw.config import resolve_config_path
def _write_valid_config(path: Path) -> None:
path.write_text(
"""
project:
name: demo
mode: docs-first
research:
topic: Synthetic benchmark research
runtime:
timezone: UTC
notifications:
channel: test
knowledge_base:
backend: markdown
root: kb
openclaw_bridge: {}
llm:
provider: openai-compatible
base_url: http://localhost:1234/v1
api_key_env: TEST_KEY
""".strip()
+ "\n",
encoding="utf-8",
)
def test_main_with_no_args_returns_zero_and_prints_help(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Bare invocation exits 0 and prints the top-level help text."""
    assert rc_cli.main([]) == 0
    help_output = capsys.readouterr().out
    assert "ResearchClaw" in help_output
    assert "usage:" in help_output


@pytest.mark.parametrize("argv", [["run", "--help"], ["validate", "--help"]])
def test_help_subcommands_exit_zero(argv: list[str]) -> None:
    """--help on subcommands raises SystemExit(0), as argparse always does."""
    with pytest.raises(SystemExit) as exc_info:
        rc_cli.main(argv)
    assert exc_info.value.code == 0


def test_generate_run_id_format() -> None:
    """Run ids look like rc-YYYYMMDD-HHMMSS-<6 hex chars>."""
    generated = rc_cli._generate_run_id("my topic")
    assert generated.startswith("rc-")
    assert re.fullmatch(r"rc-\d{8}-\d{6}-[0-9a-f]{6}", generated)
def test_cmd_run_missing_config_returns_one(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """cmd_run fails fast (exit 1) when the config path does not exist."""
    missing_args = argparse.Namespace(
        config=str(tmp_path / "missing.yaml"),
        topic=None,
        output=None,
        from_stage=None,
        auto_approve=False,
        skip_preflight=True,
        resume=False,
        skip_noncritical_stage=False,
    )
    assert rc_cli.cmd_run(missing_args) == 1
    assert "config file not found" in capsys.readouterr().err


def test_cmd_validate_missing_config_returns_one(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """cmd_validate fails fast (exit 1) when the config path does not exist."""
    missing_args = argparse.Namespace(
        config=str(tmp_path / "missing.yaml"), no_check_paths=False
    )
    assert rc_cli.cmd_validate(missing_args) == 1
    assert "config file not found" in capsys.readouterr().err


def test_cmd_validate_valid_config_returns_zero(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """A syntactically valid config passes validation with exit 0."""
    config_path = tmp_path / "config.yaml"
    _write_valid_config(config_path)
    valid_args = argparse.Namespace(config=str(config_path), no_check_paths=True)
    assert rc_cli.cmd_validate(valid_args) == 0
    assert "Config validation passed" in capsys.readouterr().out
def test_main_dispatches_run_command(monkeypatch: pytest.MonkeyPatch) -> None:
    """main() routes `run` argv to cmd_run with all flags parsed into the namespace."""
    seen: dict[str, argparse.Namespace] = {}

    def record_run(args):
        seen["args"] = args
        return 0

    monkeypatch.setattr(rc_cli, "cmd_run", record_run)
    argv = [
        "run",
        "--topic", "new topic",
        "--config", "cfg.yaml",
        "--output", "out-dir",
        "--from-stage", "PAPER_OUTLINE",
        "--auto-approve",
    ]
    assert rc_cli.main(argv) == 0
    parsed = seen["args"]
    assert parsed.topic == "new topic"
    assert parsed.config == "cfg.yaml"
    assert parsed.output == "out-dir"
    assert parsed.from_stage == "PAPER_OUTLINE"
    assert parsed.auto_approve is True


def test_main_dispatches_validate_command(monkeypatch: pytest.MonkeyPatch) -> None:
    """main() routes `validate` argv to cmd_validate with its flags parsed."""
    seen: dict[str, argparse.Namespace] = {}

    def record_validate(args):
        seen["args"] = args
        return 0

    monkeypatch.setattr(rc_cli, "cmd_validate", record_validate)
    assert rc_cli.main(["validate", "--config", "cfg.yaml", "--no-check-paths"]) == 0
    parsed = seen["args"]
    assert parsed.config == "cfg.yaml"
    assert parsed.no_check_paths is True
@pytest.mark.parametrize(
    "argv",
    [
        ["run", "--topic", "x", "--config", "c.yaml"],
        ["run", "--output", "out", "--config", "c.yaml"],
        ["run", "--from-stage", "TOPIC_INIT", "--config", "c.yaml"],
        ["run", "--auto-approve", "--config", "c.yaml"],
    ],
)
def test_run_parser_accepts_required_flags(
    argv: list[str], monkeypatch: pytest.MonkeyPatch
) -> None:
    """Each supported `run` flag parses without error (handler stubbed out)."""
    monkeypatch.setattr(rc_cli, "cmd_run", lambda args: 0)
    assert rc_cli.main(argv) == 0


def test_validate_parser_accepts_config_flag(monkeypatch: pytest.MonkeyPatch) -> None:
    """`validate --config` parses without error (handler stubbed out)."""
    monkeypatch.setattr(rc_cli, "cmd_validate", lambda args: 0)
    assert rc_cli.main(["validate", "--config", "cfg.yaml"]) == 0
# --- resolve_config_path tests ---
def test_resolve_config_finds_arc_yaml_first(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """config.arc.yaml wins over config.yaml when both exist in cwd."""
    monkeypatch.chdir(tmp_path)
    (tmp_path / "config.arc.yaml").write_text("x: 1\n")
    (tmp_path / "config.yaml").write_text("x: 2\n")
    found = resolve_config_path(None)
    assert found is not None
    assert found.name == "config.arc.yaml"


def test_resolve_config_falls_back_to_config_yaml(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Without config.arc.yaml, the search falls back to config.yaml."""
    monkeypatch.chdir(tmp_path)
    (tmp_path / "config.yaml").write_text("x: 1\n")
    found = resolve_config_path(None)
    assert found is not None
    assert found.name == "config.yaml"


def test_resolve_config_returns_none_when_missing(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """An empty cwd yields no config path at all."""
    monkeypatch.chdir(tmp_path)
    assert resolve_config_path(None) is None


def test_resolve_config_explicit_path_no_search() -> None:
    """An explicit path is returned verbatim, bypassing the search order."""
    found = resolve_config_path("/some/explicit/path.yaml")
    assert found is not None
    assert str(found) == "/some/explicit/path.yaml"
# --- cmd_init tests ---
def _write_example_config(path: Path) -> None:
path.write_text(
"""\
project:
name: "my-research"
llm:
provider: "openai"
base_url: "https://api.openai.com/v1"
api_key_env: "OPENAI_API_KEY"
primary_model: "gpt-4o"
fallback_models:
- "gpt-4.1"
- "gpt-4o-mini"
""",
encoding="utf-8",
)
def test_cmd_init_creates_config(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
    """init copies the example config, defaulting to openai when stdin is not a TTY."""
    monkeypatch.chdir(tmp_path)
    _write_example_config(tmp_path / "config.researchclaw.example.yaml")
    # Simulate non-TTY (stdin not a tty) → defaults to openai
    monkeypatch.setattr("sys.stdin", type("FakeStdin", (), {"isatty": lambda self: False})())
    assert rc_cli.cmd_init(argparse.Namespace(force=False)) == 0
    created = tmp_path / "config.arc.yaml"
    assert created.exists()
    assert 'provider: "openai"' in created.read_text()
    assert "Created config.arc.yaml" in capsys.readouterr().out


def test_cmd_init_refuses_overwrite(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
    """init without --force leaves an existing config untouched and exits 1."""
    monkeypatch.chdir(tmp_path)
    _write_example_config(tmp_path / "config.researchclaw.example.yaml")
    (tmp_path / "config.arc.yaml").write_text("existing\n")
    assert rc_cli.cmd_init(argparse.Namespace(force=False)) == 1
    assert "already exists" in capsys.readouterr().err
    assert (tmp_path / "config.arc.yaml").read_text() == "existing\n"


def test_cmd_init_force_overwrites(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """--force replaces an existing config.arc.yaml."""
    monkeypatch.chdir(tmp_path)
    _write_example_config(tmp_path / "config.researchclaw.example.yaml")
    (tmp_path / "config.arc.yaml").write_text("old\n")
    monkeypatch.setattr("sys.stdin", type("FakeStdin", (), {"isatty": lambda self: False})())
    assert rc_cli.cmd_init(argparse.Namespace(force=True)) == 0
    assert (tmp_path / "config.arc.yaml").read_text() != "old\n"
def test_cmd_run_missing_config_shows_init_hint(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
    """With no config anywhere, cmd_run fails and points the user at `researchclaw init`."""
    monkeypatch.chdir(tmp_path)
    no_config_args = argparse.Namespace(
        config=None,
        topic=None,
        output=None,
        from_stage=None,
        auto_approve=False,
        skip_preflight=True,
        resume=False,
        skip_noncritical_stage=False,
    )
    assert rc_cli.cmd_run(no_config_args) == 1
    assert "researchclaw init" in capsys.readouterr().err
def test_resume_finds_existing_checkpoint_dir(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
    """BUG-119: --resume without --output should find the latest checkpoint dir."""
    import hashlib
    import json
    monkeypatch.chdir(tmp_path)
    # Write a valid config
    config_path = tmp_path / "config.arc.yaml"
    _write_valid_config(config_path)
    # Create a fake previous run directory with a checkpoint
    topic = "Synthetic benchmark research"  # matches _write_valid_config
    # Run-dir names end with the first 6 hex chars of the topic's sha256 digest;
    # this is how resume matches a prior run for the same topic (see BUG-119).
    topic_hash = hashlib.sha256(topic.encode()).hexdigest()[:6]
    old_run_dir = tmp_path / "artifacts" / f"rc-20260319-100000-{topic_hash}"
    old_run_dir.mkdir(parents=True)
    (old_run_dir / "checkpoint.json").write_text(
        json.dumps({"last_completed_stage": 5, "last_completed_name": "HYPOTHESIS_GEN",
        "run_id": old_run_dir.name, "timestamp": "2026-03-19T10:00:00Z"})
    )
    # Mock execute_pipeline so we don't actually run
    import researchclaw.pipeline.runner as runner_mod
    monkeypatch.setattr(runner_mod, "execute_pipeline", lambda **kw: [])
    # Also mock preflight
    from unittest.mock import MagicMock
    mock_client = MagicMock()
    mock_client.preflight.return_value = (True, "OK")
    import researchclaw.llm as llm_mod
    monkeypatch.setattr(llm_mod, "create_llm_client", lambda cfg: mock_client)
    args = argparse.Namespace(
        config=str(config_path),
        topic=None,
        output=None,  # deliberately unset: resume must locate the run dir itself
        from_stage=None,
        auto_approve=False,
        skip_preflight=True,
        resume=True,
        skip_noncritical_stage=False,
        no_graceful_degradation=False,
    )
    rc_cli.cmd_run(args)
    captured = capsys.readouterr()
    # cmd_run should announce the resumed run and name the old run directory.
    assert "Found existing run to resume" in captured.out
    assert old_run_dir.name in captured.out
def test_resume_no_checkpoint_warns(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
    """BUG-119: --resume with no matching checkpoint should warn and start new."""
    monkeypatch.chdir(tmp_path)
    config_path = tmp_path / "config.arc.yaml"
    _write_valid_config(config_path)
    # An artifacts dir with no run directories inside → nothing to resume.
    (tmp_path / "artifacts").mkdir()

    import researchclaw.pipeline.runner as runner_mod

    monkeypatch.setattr(runner_mod, "execute_pipeline", lambda **kw: [])

    from unittest.mock import MagicMock

    import researchclaw.llm as llm_mod

    stub_client = MagicMock()
    stub_client.preflight.return_value = (True, "OK")
    monkeypatch.setattr(llm_mod, "create_llm_client", lambda cfg: stub_client)

    rc_cli.cmd_run(
        argparse.Namespace(
            config=str(config_path),
            topic=None,
            output=None,
            from_stage=None,
            auto_approve=False,
            skip_preflight=True,
            resume=True,
            skip_noncritical_stage=False,
            no_graceful_degradation=False,
        )
    )
    assert "no checkpoint found" in capsys.readouterr().err
def test_main_dispatches_init(monkeypatch: pytest.MonkeyPatch) -> None:
    """main() routes `init --force` to cmd_init with force=True."""
    seen: dict[str, argparse.Namespace] = {}

    def record_init(args):
        seen["args"] = args
        return 0

    monkeypatch.setattr(rc_cli, "cmd_init", record_init)
    assert rc_cli.main(["init", "--force"]) == 0
    assert seen["args"].force is True
================================================
FILE: tests/test_rc_config.py
================================================
import json
from pathlib import Path
from typing import cast
import pytest
from researchclaw.config import (
ExperimentConfig,
RCConfig,
SandboxConfig,
SecurityConfig,
ValidationResult,
load_config,
validate_config,
)
def _write_valid_config(tmp_path: Path) -> Path:
kb_root = tmp_path / "docs" / "kb"
for name in (
"questions",
"literature",
"experiments",
"findings",
"decisions",
"reviews",
):
(kb_root / name).mkdir(parents=True, exist_ok=True)
config_path = tmp_path / "config.rc.yaml"
_ = config_path.write_text(
"""
project:
name: demo
mode: docs-first
research:
topic: Test topic
domains: [ml, agents]
runtime:
timezone: America/New_York
notifications:
channel: discord
knowledge_base:
backend: markdown
root: docs/kb
openclaw_bridge:
use_cron: true
use_message: true
use_memory: true
use_sessions_spawn: true
use_web_fetch: true
use_browser: false
llm:
provider: openai-compatible
base_url: https://example.invalid/v1
api_key_env: OPENAI_API_KEY
security:
hitl_required_stages: [5, 9, 20]
experiment:
mode: simulated
""".strip()
+ "\n",
encoding="utf-8",
)
return config_path
def _valid_config_data() -> dict[str, dict[str, object]]:
return {
"project": {"name": "demo", "mode": "docs-first"},
"research": {"topic": "Test topic", "domains": ["ml", "agents"]},
"runtime": {"timezone": "America/New_York"},
"notifications": {"channel": "discord"},
"knowledge_base": {"backend": "markdown", "root": "docs/kb"},
"openclaw_bridge": {
"use_cron": True,
"use_message": True,
"use_memory": True,
"use_sessions_spawn": True,
"use_web_fetch": True,
"use_browser": False,
},
"llm": {
"provider": "openai-compatible",
"base_url": "https://example.invalid/v1",
"api_key_env": "OPENAI_API_KEY",
"primary_model": "gpt-4.1",
"fallback_models": ["gpt-4o-mini", "gpt-4o"],
},
"security": {"hitl_required_stages": [5, 9, 20]},
"experiment": {
"mode": "simulated",
"metric_direction": "minimize",
},
}
def test_valid_config_data_helper_returns_expected_baseline_shape():
    """Sanity-check the shared fixture dict before other tests mutate copies of it."""
    baseline = _valid_config_data()
    assert baseline["project"]["name"] == "demo"
    assert baseline["knowledge_base"]["root"] == "docs/kb"
    assert baseline["security"]["hitl_required_stages"] == [5, 9, 20]


def test_validate_config_with_valid_data_returns_ok_true(tmp_path: Path):
    """A fully valid config validates with no errors."""
    outcome = validate_config(
        _valid_config_data(), project_root=tmp_path, check_paths=False
    )
    assert isinstance(outcome, ValidationResult)
    assert outcome.ok is True
    assert outcome.errors == ()


def test_validate_config_missing_required_fields_returns_errors(tmp_path: Path):
    """Emptying the research section surfaces the missing-topic error."""
    broken = _valid_config_data()
    broken["research"] = {}
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert "Missing required field: research.topic" in outcome.errors


def test_validate_config_rejects_invalid_project_mode(tmp_path: Path):
    broken = _valid_config_data()
    broken["project"]["mode"] = "invalid-mode"
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert "Invalid project.mode: invalid-mode" in outcome.errors


def test_validate_config_rejects_invalid_knowledge_base_backend(tmp_path: Path):
    broken = _valid_config_data()
    broken["knowledge_base"]["backend"] = "sqlite"
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert "Invalid knowledge_base.backend: sqlite" in outcome.errors


@pytest.mark.parametrize("entry", [0, 24, "5", 9.1])
def test_validate_config_rejects_invalid_hitl_required_stages_entries(
    tmp_path: Path, entry: object
):
    """Out-of-range or wrongly typed stage ids are rejected individually."""
    broken = _valid_config_data()
    broken["security"]["hitl_required_stages"] = [5, entry, 20]
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert f"Invalid security.hitl_required_stages entry: {entry}" in outcome.errors


def test_validate_config_rejects_non_list_hitl_required_stages(tmp_path: Path):
    broken = _valid_config_data()
    broken["security"]["hitl_required_stages"] = "5,9,20"
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert "security.hitl_required_stages must be a list" in outcome.errors


def test_validate_config_rejects_invalid_experiment_mode(tmp_path: Path):
    broken = _valid_config_data()
    broken["experiment"]["mode"] = "kubernetes"
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert "Invalid experiment.mode: kubernetes" in outcome.errors


def test_validate_config_accepts_docker_mode(tmp_path: Path):
    tweaked = _valid_config_data()
    tweaked["experiment"]["mode"] = "docker"
    assert validate_config(tweaked, project_root=tmp_path, check_paths=False).ok is True


def test_validate_config_rejects_invalid_metric_direction(tmp_path: Path):
    broken = _valid_config_data()
    broken["experiment"]["metric_direction"] = "upward"
    outcome = validate_config(broken, project_root=tmp_path, check_paths=False)
    assert outcome.ok is False
    assert "Invalid experiment.metric_direction: upward" in outcome.errors
def test_rcconfig_from_dict_happy_path(tmp_path: Path):
    """from_dict builds a typed RCConfig; list fields become tuples."""
    config = RCConfig.from_dict(
        _valid_config_data(), project_root=tmp_path, check_paths=False
    )
    assert isinstance(config, RCConfig)
    assert config.project.name == "demo"
    assert config.research.domains == ("ml", "agents")
    assert config.llm.fallback_models == ("gpt-4o-mini", "gpt-4o")


def test_rcconfig_from_dict_missing_fields_raises_value_error(tmp_path: Path):
    """Dropping a required section raises ValueError naming the missing field."""
    incomplete = _valid_config_data()
    del incomplete["runtime"]
    with pytest.raises(ValueError, match="Missing required field: runtime.timezone"):
        _ = RCConfig.from_dict(incomplete, project_root=tmp_path, check_paths=False)


def test_rcconfig_load_from_yaml_file(tmp_path: Path):
    loaded = RCConfig.load(_write_valid_config(tmp_path), project_root=tmp_path)
    assert isinstance(loaded, RCConfig)
    assert loaded.project.name == "demo"
    assert loaded.knowledge_base.root == "docs/kb"


def test_load_config_wrapper_returns_rcconfig(tmp_path: Path):
    loaded = load_config(_write_valid_config(tmp_path), project_root=tmp_path)
    assert isinstance(loaded, RCConfig)
    assert loaded.security.hitl_required_stages == (5, 9, 20)


def test_security_config_defaults_match_expected_values():
    security = SecurityConfig()
    assert security.hitl_required_stages == (5, 9, 20)
    assert security.allow_publish_without_approval is False
    assert security.redact_sensitive_logs is True


def test_experiment_config_defaults_mode_is_simulated():
    experiment = ExperimentConfig()
    assert experiment.mode == "simulated"
    assert experiment.metric_direction == "minimize"


def test_sandbox_config_defaults_match_expected_values():
    from researchclaw.config import DEFAULT_PYTHON_PATH

    sandbox = SandboxConfig()
    assert sandbox.python_path == DEFAULT_PYTHON_PATH
    assert sandbox.gpu_required is False
    assert sandbox.max_memory_mb == 4096
    assert "numpy" in sandbox.allowed_imports
def test_to_dict_roundtrip_rehydrates_equivalent_rcconfig(tmp_path: Path):
    """to_dict → JSON → from_dict yields an equal config; tuples survive to_dict."""
    original = RCConfig.from_dict(
        _valid_config_data(), project_root=tmp_path, check_paths=False
    )
    via_json = cast(dict[str, object], json.loads(json.dumps(original.to_dict())))
    rehydrated = RCConfig.from_dict(via_json, project_root=tmp_path, check_paths=False)
    assert rehydrated == original
    assert isinstance(original.to_dict()["security"]["hitl_required_stages"], tuple)


def test_check_paths_false_skips_missing_kb_root_validation(tmp_path: Path):
    """check_paths=False suppresses filesystem existence errors."""
    data = _valid_config_data()
    data["knowledge_base"]["root"] = "docs/missing-kb"
    outcome = validate_config(data, project_root=tmp_path, check_paths=False)
    assert outcome.ok is True
    assert not any(error.startswith("Missing path:") for error in outcome.errors)


def test_path_validation_missing_kb_root_is_error(tmp_path: Path):
    """check_paths=True flags a non-existent kb root as an error."""
    outcome = validate_config(
        _valid_config_data(), project_root=tmp_path, check_paths=True
    )
    assert outcome.ok is False
    assert any(error.startswith("Missing path:") for error in outcome.errors)


def test_validate_config_missing_kb_subdirs_emits_warnings(tmp_path: Path):
    """A bare kb root validates OK but warns once per recommended subdir."""
    data = _valid_config_data()
    (tmp_path / "docs" / "kb").mkdir(parents=True)
    outcome = validate_config(data, project_root=tmp_path, check_paths=True)
    assert outcome.ok is True
    assert len(outcome.warnings) == 6
    assert all(
        warning.startswith("Missing recommended kb subdir:")
        for warning in outcome.warnings
    )


def test_rcconfig_from_dict_uses_default_security_when_missing(tmp_path: Path):
    """An absent security section falls back to SecurityConfig defaults."""
    data = _valid_config_data()
    del data["security"]
    config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
    assert config.security.hitl_required_stages == (5, 9, 20)


def test_load_uses_file_parent_as_default_project_root(tmp_path: Path):
    """Omitting project_root defaults it to the config file's parent directory."""
    config = RCConfig.load(_write_valid_config(tmp_path))
    assert config.project.name == "demo"
    assert config.knowledge_base.root == "docs/kb"
================================================
FILE: tests/test_rc_contracts.py
================================================
import re
import pytest
from researchclaw.pipeline.contracts import CONTRACTS, StageContract
from researchclaw.pipeline.stages import GATE_STAGES, STAGE_SEQUENCE, Stage
def test_contracts_dict_has_exactly_23_entries():
    """One contract per pipeline stage."""
    assert len(CONTRACTS) == 23


def test_every_stage_has_matching_contract_entry():
    assert set(CONTRACTS.keys()) == set(Stage)


@pytest.mark.parametrize("stage", STAGE_SEQUENCE)
def test_each_stage_member_resolves_to_stage_contract(stage: Stage):
    assert isinstance(CONTRACTS[stage], StageContract)


@pytest.mark.parametrize("stage,contract", tuple(CONTRACTS.items()))
def test_contract_stage_field_matches_dict_key(stage: Stage, contract: StageContract):
    assert contract.stage is stage


@pytest.mark.parametrize("contract", tuple(CONTRACTS.values()))
def test_output_files_is_non_empty_for_all_contracts(contract: StageContract):
    assert contract.output_files


@pytest.mark.parametrize("stage,contract", tuple(CONTRACTS.items()))
def test_error_code_starts_with_e_and_contains_stage_number(
    stage: Stage, contract: StageContract
):
    """Error codes follow E<NN>_<NAME>, with NN the zero-padded stage number."""
    code = contract.error_code
    assert code.startswith("E")
    assert f"{int(stage):02d}" in code
    assert re.match(r"^E\d{2}_[A-Z0-9_]+$", code)


@pytest.mark.parametrize("contract", tuple(CONTRACTS.values()))
def test_max_retries_is_non_negative_for_all_contracts(contract: StageContract):
    assert contract.max_retries >= 0
def test_gate_stages_have_expected_max_retries():
    """Human-gate stages are never retried automatically."""
    for gate in (Stage.LITERATURE_SCREEN, Stage.EXPERIMENT_DESIGN, Stage.QUALITY_GATE):
        assert CONTRACTS[gate].max_retries == 0


@pytest.mark.parametrize("stage", tuple(GATE_STAGES))
def test_gate_stage_contracts_are_never_retried(stage: Stage):
    assert CONTRACTS[stage].max_retries == 0


def test_topic_init_contract_has_expected_input_output_files():
    topic_init = CONTRACTS[Stage.TOPIC_INIT]
    assert topic_init.input_files == ()
    assert topic_init.output_files == ("goal.md", "hardware_profile.json")


def test_export_publish_contract_has_expected_outputs():
    assert CONTRACTS[Stage.EXPORT_PUBLISH].output_files == ("paper_final.md", "code/")


@pytest.mark.parametrize("contract", tuple(CONTRACTS.values()))
def test_dod_is_non_empty_string_for_all_contracts(contract: StageContract):
    assert isinstance(contract.dod, str)
    assert contract.dod.strip()


@pytest.mark.parametrize("contract", tuple(CONTRACTS.values()))
def test_input_files_is_tuple_of_strings(contract: StageContract):
    assert isinstance(contract.input_files, tuple)
    assert all(isinstance(path, str) and path for path in contract.input_files)


@pytest.mark.parametrize("contract", tuple(CONTRACTS.values()))
def test_output_files_is_tuple_of_strings(contract: StageContract):
    assert isinstance(contract.output_files, tuple)
    assert all(isinstance(path, str) and path for path in contract.output_files)


def test_error_codes_are_unique_across_contracts():
    codes = [contract.error_code for contract in CONTRACTS.values()]
    assert len(codes) == len(set(codes))


def test_contracts_follow_stage_sequence_order():
    assert tuple(CONTRACTS.keys()) == STAGE_SEQUENCE


@pytest.mark.parametrize("stage", STAGE_SEQUENCE)
def test_contract_stage_int_matches_stage_enum_value(stage: Stage):
    assert int(CONTRACTS[stage].stage) == int(stage)
================================================
FILE: tests/test_rc_docker_sandbox.py
================================================
"""Tests for DockerSandbox — all mocked, no real Docker needed."""
from __future__ import annotations
import subprocess
import threading
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.config import DockerSandboxConfig, ExperimentConfig
from researchclaw.experiment.docker_sandbox import DockerSandbox, _next_container_name
from researchclaw.experiment.factory import create_sandbox
from researchclaw.experiment.sandbox import SandboxResult
# ── SandboxResult contract ─────────────────────────────────────────────
def test_sandbox_result_fields():
    """SandboxResult exposes returncode, parsed metrics, and the timeout flag."""
    outcome = SandboxResult(
        returncode=0,
        stdout="primary_metric: 0.95\n",
        stderr="",
        elapsed_sec=1.2,
        metrics={"primary_metric": 0.95},
        timed_out=False,
    )
    assert outcome.returncode == 0
    assert outcome.metrics["primary_metric"] == 0.95
    assert outcome.timed_out is False
# ── DockerSandbox command building ─────────────────────────────────────
def test_build_run_command_network_none(tmp_path: Path):
    """network_policy='none' → --network none, --user UID:GID."""
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="none"), tmp_path / "work")
    cmd = sandbox._build_run_command(
        tmp_path / "staging",
        entry_point="main.py",
        container_name="rc-test-1",
    )
    for token in ("docker", "--gpus", "--network", "none",
                  "--memory=8192m", "--shm-size=2048m"):
        assert token in cmd
    assert cmd[-1] == "main.py"
    # Container must run as a non-root user.
    assert "--user" in cmd


def test_build_run_command_setup_only(tmp_path: Path):
    """Default network_policy='setup_only' → RC_SETUP_ONLY_NETWORK=1, --cap-add."""
    sandbox = DockerSandbox(DockerSandboxConfig(), tmp_path / "work")  # default policy
    cmd = sandbox._build_run_command(
        tmp_path / "staging",
        entry_point="main.py",
        container_name="rc-test-setup",
    )
    # Every value following a -e flag is an environment assignment.
    assert "-e" in cmd
    env_values = [cmd[i + 1] for i, token in enumerate(cmd) if token == "-e"]
    assert "RC_SETUP_ONLY_NETWORK=1" in env_values
    # NET_ADMIN is needed so the container can cut its own network after setup.
    assert "--cap-add=NET_ADMIN" in cmd
    # No --network flag at all: setup still needs network access.
    assert cmd.count("--network") == 0
    # Runs as the host user so the experiment can write results.json.
    assert "--user" in cmd
def test_build_run_command_full_network(tmp_path: Path):
    """network_policy='full' → no --network none, still has --user."""
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="full"), tmp_path / "work")
    cmd = sandbox._build_run_command(
        tmp_path / "staging",
        entry_point="main.py",
        container_name="rc-test-full",
    )
    assert cmd.count("--network") == 0
    assert "--user" in cmd


def test_build_run_command_no_gpu(tmp_path: Path):
    """gpu_enabled=False drops the --gpus flag entirely."""
    sandbox = DockerSandbox(
        DockerSandboxConfig(gpu_enabled=False, network_policy="none"), tmp_path / "work"
    )
    cmd = sandbox._build_run_command(
        tmp_path / "staging",
        entry_point="main.py",
        container_name="rc-test-2",
    )
    assert "--gpus" not in cmd


def test_build_run_command_specific_gpus(tmp_path: Path):
    """Explicit gpu_device_ids appear in the --gpus argument value."""
    sandbox = DockerSandbox(
        DockerSandboxConfig(gpu_device_ids=(0, 2), network_policy="none"),
        tmp_path / "work",
    )
    cmd = sandbox._build_run_command(
        tmp_path / "staging",
        entry_point="main.py",
        container_name="rc-test-3",
    )
    assert "--gpus" in cmd
    assert "0,2" in cmd[cmd.index("--gpus") + 1]
# ── Harness injection ─────────────────────────────────────────────────
def test_harness_injection(tmp_path: Path):
    """_inject_harness copies the harness template into the project dir."""
    template = (
        Path(__file__).parent.parent
        / "researchclaw" / "experiment" / "harness_template.py"
    )
    if not template.exists():
        pytest.skip("harness_template.py not found")
    project_dir = tmp_path / "project"
    project_dir.mkdir()
    DockerSandbox._inject_harness(project_dir)
    assert (project_dir / "experiment_harness.py").exists()
# ── Factory ────────────────────────────────────────────────────────────
def test_factory_returns_experiment_sandbox(tmp_path: Path):
    """mode='sandbox' yields the plain subprocess ExperimentSandbox."""
    from researchclaw.experiment.sandbox import ExperimentSandbox

    built = create_sandbox(ExperimentConfig(mode="sandbox"), tmp_path / "work")
    assert isinstance(built, ExperimentSandbox)


@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.ensure_image", return_value=True)
@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.check_docker_available", return_value=True)
def test_factory_returns_docker_sandbox(mock_avail, mock_image, tmp_path: Path):
    """mode='docker' with Docker and image available yields DockerSandbox."""
    built = create_sandbox(ExperimentConfig(mode="docker"), tmp_path / "work")
    assert isinstance(built, DockerSandbox)


@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.check_docker_available", return_value=False)
def test_factory_falls_back_when_docker_unavailable(mock_avail, tmp_path: Path):
    """BUG-002: missing Docker falls back to the subprocess sandbox, not an error."""
    from researchclaw.experiment.sandbox import ExperimentSandbox

    built = create_sandbox(ExperimentConfig(mode="docker"), tmp_path / "work")
    assert isinstance(built, ExperimentSandbox)


@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.ensure_image", return_value=False)
@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.check_docker_available", return_value=True)
def test_factory_raises_when_image_missing(mock_avail, mock_image, tmp_path: Path):
    """Docker present but image absent is a hard error."""
    with pytest.raises(RuntimeError, match="not found locally"):
        create_sandbox(ExperimentConfig(mode="docker"), tmp_path / "work")
# ── run() with mocked subprocess ──────────────────────────────────────
@patch("subprocess.run")
def test_docker_run_success(mock_run, tmp_path: Path):
    """A zero-exit container run parses metrics out of stdout."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=["docker", "run"],
        returncode=0,
        stdout="primary_metric: 0.85\n",
        stderr="",
    )
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="none"), tmp_path / "work")
    outcome = sandbox.run("print('hello')", timeout_sec=60)
    assert outcome.returncode == 0
    assert outcome.metrics.get("primary_metric") == 0.85
    assert outcome.timed_out is False


@patch("subprocess.run")
def test_docker_run_timeout(mock_run, tmp_path: Path):
    """A TimeoutExpired from subprocess surfaces as timed_out with returncode -1."""
    mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker run", timeout=10)
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="none"), tmp_path / "work")
    outcome = sandbox.run("import time; time.sleep(999)", timeout_sec=10)
    assert outcome.timed_out is True
    assert outcome.returncode == -1
# ── Dep detection ─────────────────────────────────────────────────────
def test_detect_pip_packages(tmp_path: Path):
    """Built-in packages (numpy, torchdiffeq, PIL) are excluded from detection."""
    source = "import torchdiffeq\nimport numpy\nfrom PIL import Image\n"
    (tmp_path / "main.py").write_text(source)
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "numpy" not in detected
    assert "torchdiffeq" not in detected


def test_detect_pip_packages_finds_unknown(tmp_path: Path):
    """Imports outside the builtin set must still be reported."""
    (tmp_path / "main.py").write_text("import some_new_package\nimport numpy\n")
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "some_new_package" in detected
    assert "numpy" not in detected


def test_detect_pip_packages_skips_setup_py(tmp_path: Path):
    """setup.py is packaging metadata, not experiment code — never scanned."""
    (tmp_path / "setup.py").write_text("import some_setup_dep\n")
    (tmp_path / "main.py").write_text("import numpy\n")
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "some_setup_dep" not in detected


def test_detect_pip_packages_maps_imports(tmp_path: Path):
    """Known import-to-pip mappings are applied (cv2 -> opencv-python)."""
    (tmp_path / "main.py").write_text("import cv2\nimport wandb\n")
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "opencv-python" in detected
    assert "wandb" in detected
def test_next_container_name_is_thread_safe():
    """Concurrent callers must never receive duplicate container names."""
    collected: list[str] = []
    guard = threading.Lock()

    def grab_names() -> None:
        # 20 names per worker; appends are serialized through the lock.
        for _ in range(20):
            candidate = _next_container_name()
            with guard:
                collected.append(candidate)

    workers = [threading.Thread(target=grab_names) for _ in range(5)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    assert len(collected) == 100
    assert len(collected) == len(set(collected))
# ── requirements.txt generation ──────────────────────────────────────
def test_write_requirements_txt_from_auto_detect(tmp_path: Path):
    """Auto-detected packages should be written to requirements.txt."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import wandb\nimport optuna\n")
    sandbox = DockerSandbox(DockerSandboxConfig(auto_install_deps=True), tmp_path / "work")
    sandbox._write_requirements_txt(staging)
    req_path = staging / "requirements.txt"
    assert req_path.exists()
    body = req_path.read_text()
    assert "wandb" in body
    assert "optuna" in body


def test_write_requirements_txt_with_pip_pre_install(tmp_path: Path):
    """pip_pre_install packages should be added to requirements.txt."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import numpy\n")
    sandbox = DockerSandbox(
        DockerSandboxConfig(pip_pre_install=("einops==0.8.0", "kornia")),
        tmp_path / "work",
    )
    sandbox._write_requirements_txt(staging)
    req_path = staging / "requirements.txt"
    assert req_path.exists()
    body = req_path.read_text()
    assert "einops==0.8.0" in body
    assert "kornia" in body


def test_write_requirements_txt_respects_existing(tmp_path: Path):
    """If LLM already generated requirements.txt, append only new packages."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import numpy\n")
    (staging / "requirements.txt").write_text("wandb\n")
    sandbox = DockerSandbox(
        DockerSandboxConfig(pip_pre_install=("wandb", "einops")), tmp_path / "work"
    )
    sandbox._write_requirements_txt(staging)
    body = (staging / "requirements.txt").read_text()
    # wandb already in existing file, should not be duplicated
    assert body.count("wandb") == 1
    # einops should be appended
    assert "einops" in body


def test_write_requirements_txt_no_packages(tmp_path: Path):
    """No requirements.txt if no packages needed."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import numpy\n")
    sandbox = DockerSandbox(DockerSandboxConfig(), tmp_path / "work")
    sandbox._write_requirements_txt(staging)
    assert not (staging / "requirements.txt").exists()
# ── Static checks (mocked) ────────────────────────────────────────────
@patch("subprocess.run")
def test_check_docker_available_true(mock_run):
    """A zero exit status from the docker CLI probe reports availability."""
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0)
    assert DockerSandbox.check_docker_available() is True


@patch("subprocess.run")
def test_check_docker_available_false(mock_run):
    """A non-zero exit status means Docker is not usable."""
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=1)
    assert DockerSandbox.check_docker_available() is False


@patch("subprocess.run", side_effect=FileNotFoundError)
def test_check_docker_available_no_binary(mock_run):
    """A missing docker binary (FileNotFoundError) is treated as unavailable."""
    assert DockerSandbox.check_docker_available() is False


@patch("subprocess.run")
def test_ensure_image_true(mock_run):
    """Exit 0 from the image probe means the image exists locally."""
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0)
    assert DockerSandbox.ensure_image("researchclaw/experiment:latest") is True


@patch("subprocess.run")
def test_ensure_image_false(mock_run):
    """A non-zero probe result means the image is missing."""
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=1)
    assert DockerSandbox.ensure_image("nonexistent:latest") is False
# ── Default config values ────────────────────────────────────────────
def test_default_network_policy_is_setup_only():
    """Default network_policy should be 'setup_only', not 'none'."""
    assert DockerSandboxConfig().network_policy == "setup_only"


def test_default_auto_install_deps_enabled():
    """Dependency auto-install is on by default."""
    assert DockerSandboxConfig().auto_install_deps is True
# ── Entry point path traversal validation ─────────────────────────────
@patch("researchclaw.experiment.docker_sandbox.subprocess.run")
def test_run_project_rejects_path_traversal(mock_run: MagicMock, tmp_path: Path):
    """run_project() must reject entry_point with '..' components."""
    project = tmp_path / "proj"
    project.mkdir()
    (project / "main.py").write_text("print('hi')")
    work = tmp_path / "work"
    sandbox = DockerSandbox(DockerSandboxConfig(), work)
    # Create escape target so .exists() alone wouldn't catch it
    work.mkdir(parents=True, exist_ok=True)
    (work / "escape.py").write_text("print('escaped!')")
    outcome = sandbox.run_project(project, entry_point="../escape.py")
    assert outcome.returncode == -1
    assert ".." in outcome.stderr
    mock_run.assert_not_called()


@patch("researchclaw.experiment.docker_sandbox.subprocess.run")
def test_run_project_rejects_absolute_path(mock_run: MagicMock, tmp_path: Path):
    """run_project() must reject absolute entry_point paths."""
    project = tmp_path / "proj"
    project.mkdir()
    (project / "main.py").write_text("print('hi')")
    sandbox = DockerSandbox(DockerSandboxConfig(), tmp_path / "work")
    outcome = sandbox.run_project(project, entry_point="/etc/passwd")
    assert outcome.returncode == -1
    lowered = outcome.stderr.lower()
    assert "relative" in lowered or "absolute" in lowered
    mock_run.assert_not_called()
# ── Container cleanup behavior ────────────────────────────────────────
@patch.object(DockerSandbox, "_remove_container")
@patch("subprocess.run")
def test_cleanup_on_normal_exit(mock_run: MagicMock, mock_remove: MagicMock, tmp_path: Path):
    """_remove_container is called on normal successful exit."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=["docker", "run"], returncode=0, stdout="metric: 1.0\n", stderr="",
    )
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="none"), tmp_path / "work")
    outcome = sandbox.run("print('ok')", timeout_sec=60)
    assert outcome.returncode == 0
    mock_remove.assert_called_once()


@patch.object(DockerSandbox, "_remove_container")
@patch.object(DockerSandbox, "_kill_container")
@patch("subprocess.run")
def test_cleanup_on_timeout(
    mock_run: MagicMock, mock_kill: MagicMock, mock_remove: MagicMock, tmp_path: Path,
):
    """Both _kill_container and _remove_container are called on timeout."""
    mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker run", timeout=10)
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="none"), tmp_path / "work")
    outcome = sandbox.run("import time; time.sleep(999)", timeout_sec=10)
    assert outcome.timed_out is True
    mock_kill.assert_called_once()
    mock_remove.assert_called_once()


@patch.object(DockerSandbox, "_remove_container")
@patch("subprocess.run")
def test_cleanup_on_exception(mock_run: MagicMock, mock_remove: MagicMock, tmp_path: Path):
    """_remove_container is called even when subprocess.run raises an unexpected exception."""
    mock_run.side_effect = OSError("Docker daemon not responding")
    sandbox = DockerSandbox(DockerSandboxConfig(network_policy="none"), tmp_path / "work")
    outcome = sandbox.run("print('hi')", timeout_sec=60)
    assert outcome.returncode == -1
    assert "Docker execution error" in outcome.stderr
    mock_remove.assert_called_once()


@patch.object(DockerSandbox, "_remove_container")
@patch.object(DockerSandbox, "_kill_container")
@patch("subprocess.run")
def test_keep_containers_skips_removal(
    mock_run: MagicMock, mock_kill: MagicMock, mock_remove: MagicMock, tmp_path: Path,
):
    """When keep_containers=True, _remove_container is never called."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=["docker", "run"], returncode=0, stdout="", stderr="",
    )
    sandbox = DockerSandbox(
        DockerSandboxConfig(network_policy="none", keep_containers=True),
        tmp_path / "work",
    )
    sandbox.run("print('ok')", timeout_sec=60)
    mock_remove.assert_not_called()
================================================
FILE: tests/test_rc_e2e_regression.py
================================================
# pyright: reportMissingImports=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownVariableType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportPrivateUsage=false, reportUnknownLambdaType=false
from __future__ import annotations
import json
import urllib.error
from email.message import Message
from pathlib import Path
from unittest.mock import patch
import pytest
class _DummyResponse:
def __init__(self, payload: bytes) -> None:
self._payload: bytes = payload
def read(self) -> bytes:
return self._payload
def __enter__(self) -> _DummyResponse:
return self
def __exit__(self, exc_type, exc, tb) -> None:
_ = exc_type, exc, tb
return None
class TestRateLimitRetry:
    """HTTP 429 retry behaviour of the Semantic Scholar client."""

    def test_s2_429_retries_and_succeeds(self) -> None:
        from researchclaw.literature.semantic_scholar import (
            _reset_circuit_breaker,
            search_semantic_scholar,
        )

        _reset_circuit_breaker()  # ensure clean CB state from prior tests
        attempts: list[int] = []

        def fake_urlopen(req, **kwargs):
            _ = kwargs
            attempts.append(1)
            if len(attempts) == 1:
                # First attempt is rate-limited; the client should retry.
                raise urllib.error.HTTPError(
                    req.full_url if hasattr(req, "full_url") else str(req),
                    429,
                    "Too Many Requests",
                    Message(),
                    None,
                )
            record = {
                "paperId": "abc123",
                "title": "Test Paper",
                "authors": [{"name": "Smith"}],
                "year": 2024,
                "abstract": "test abstract",
                "venue": "NeurIPS",
                "citationCount": 10,
                "externalIds": {"DOI": "10.1234/test"},
                "url": "https://example.com",
            }
            return _DummyResponse(json.dumps({"data": [record]}).encode("utf-8"))

        with patch("urllib.request.urlopen", side_effect=fake_urlopen), patch("time.sleep"):
            papers = search_semantic_scholar("test query", limit=5)
        assert len(attempts) >= 2
        assert len(papers) == 1

    def test_s2_persistent_429_exhausts_retries_and_returns_empty(self) -> None:
        from researchclaw.literature.semantic_scholar import (
            _MAX_RETRIES,
            _reset_circuit_breaker,
            search_semantic_scholar,
        )

        _reset_circuit_breaker()  # ensure clean CB state from prior tests
        attempts: list[int] = []

        def fake_urlopen(req, **kwargs):
            _ = kwargs
            attempts.append(1)
            # Every attempt is rate-limited; the client must give up eventually.
            raise urllib.error.HTTPError(
                req.full_url if hasattr(req, "full_url") else str(req),
                429,
                "Too Many Requests",
                Message(),
                None,
            )

        with patch("urllib.request.urlopen", side_effect=fake_urlopen), patch("time.sleep"):
            papers = search_semantic_scholar("test query", limit=5)
        assert papers == []
        assert len(attempts) == _MAX_RETRIES
class TestDegradationChain:
    """Search degradation: live APIs → cached results → empty list."""

    def test_search_degrades_to_cache_on_api_failure(self, tmp_path: Path) -> None:
        from researchclaw.literature.cache import put_cache
        from researchclaw.literature.search import search_papers

        cached = [
            {
                "paper_id": "cached-1",
                "title": "Cached Paper",
                "authors": [],
                "year": 2024,
                "abstract": "cached",
                "venue": "",
                "citation_count": 5,
                "doi": "",
                "arxiv_id": "",
                "url": "",
                "source": "semantic_scholar",
            }
        ]
        put_cache(
            "test degradation", "semantic_scholar", 20, cached, cache_base=tmp_path
        )
        # Both live backends fail; the cache (redirected to tmp_path) must serve.
        with patch(
            "researchclaw.literature.search.search_semantic_scholar",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.search.search_arxiv",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.cache._DEFAULT_CACHE_DIR", tmp_path
        ):
            papers = search_papers("test degradation", limit=20)
        assert len(papers) >= 1
        assert any(p.title == "Cached Paper" for p in papers)

    def test_search_empty_on_total_failure(self, tmp_path: Path) -> None:
        from researchclaw.literature.search import search_papers

        # All backends fail and the cache directory is empty → empty result.
        with patch(
            "researchclaw.literature.search.search_openalex",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.search.search_semantic_scholar",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.search.search_arxiv",
            side_effect=RuntimeError("API down"),
        ), patch(
            "researchclaw.literature.cache._DEFAULT_CACHE_DIR",
            tmp_path / "empty-cache",
        ):
            papers = search_papers("no results query", limit=20)
        assert papers == []
class TestLLMFallback:
    """Fallback-model selection and API-key preflight in LLMClient."""

    def test_primary_403_forbidden_fallback_succeeds(self) -> None:
        from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse

        client = LLMClient(
            LLMConfig(
                base_url="https://api.example.com/v1",
                api_key="test-key",
                primary_model="gpt-blocked",
                fallback_models=["gpt-fallback"],
                max_retries=1,
            )
        )
        seen_models: list[str] = []

        def fake_raw_call(model, messages, max_tokens, temperature, json_mode):
            _ = messages, max_tokens, temperature, json_mode
            seen_models.append(model)
            if model == "gpt-blocked":
                # Primary model rejected with 403 → client must fall back.
                raise urllib.error.HTTPError(
                    "url", 403, "not allowed to use model", Message(), None
                )
            return LLMResponse(content="ok", model=model)

        with patch.object(client, "_raw_call", side_effect=fake_raw_call):
            resp = client.chat([{"role": "user", "content": "test"}])
        assert resp.content == "ok"
        assert "gpt-blocked" in seen_models
        assert "gpt-fallback" in seen_models

    def test_preflight_detects_401(self) -> None:
        from researchclaw.llm.client import LLMClient, LLMConfig

        client = LLMClient(
            LLMConfig(
                base_url="https://api.example.com/v1",
                api_key="bad-key",
                primary_model="gpt-test",
                fallback_models=[],
                max_retries=1,
            )
        )
        if not hasattr(client, "preflight"):
            pytest.skip("preflight() not yet implemented")
        unauthorized = urllib.error.HTTPError("url", 401, "Unauthorized", Message(), None)
        with patch.object(client, "chat", side_effect=unauthorized):
            ok, msg = client.preflight()
        assert ok is False
        assert "Invalid API key" in msg
class TestNoncriticalStageSkip:
    """Pipeline behaviour when stages fail under skip_noncritical=True."""

    @staticmethod
    def _make_rc_config(tmp_path: Path):
        """Build a minimal RCConfig from an in-memory dict (no path checks)."""
        from researchclaw.config import RCConfig

        raw = {
            "project": {"name": "rc-e2e-regression", "mode": "docs-first"},
            "research": {"topic": "pipeline regression"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local"},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline",
            },
        }
        return RCConfig.from_dict(raw, project_root=tmp_path, check_paths=False)

    def test_noncritical_stage_failure_is_skipped(self, tmp_path: Path) -> None:
        from researchclaw.adapters import AdapterBundle
        from researchclaw.pipeline import runner as rc_runner
        from researchclaw.pipeline.executor import StageResult
        from researchclaw.pipeline.stages import STAGE_SEQUENCE, Stage, StageStatus

        run_dir = tmp_path / "run"
        run_dir.mkdir()
        config = self._make_rc_config(tmp_path)
        adapters = AdapterBundle()

        def fake_execute_stage(stage: Stage, **kwargs) -> StageResult:
            _ = kwargs
            # Only the (non-critical) archive stage fails; everything else succeeds.
            if stage is Stage.KNOWLEDGE_ARCHIVE:
                return StageResult(
                    stage=stage,
                    status=StageStatus.FAILED,
                    artifacts=(),
                    error="archive error",
                )
            return StageResult(stage=stage, status=StageStatus.DONE, artifacts=("ok.md",))

        with patch.object(rc_runner, "execute_stage", side_effect=fake_execute_stage):
            results = rc_runner.execute_pipeline(
                run_dir=run_dir,
                run_id="run-skip-noncritical",
                config=config,
                adapters=adapters,
                skip_noncritical=True,
            )
        assert len(results) == len(STAGE_SEQUENCE)
        assert results[-1].stage is Stage.CITATION_VERIFY
        assert any(
            r.stage is Stage.KNOWLEDGE_ARCHIVE and r.status is StageStatus.FAILED
            for r in results
        )

    def test_critical_stage_failure_still_aborts(self, tmp_path: Path) -> None:
        from researchclaw.adapters import AdapterBundle
        from researchclaw.pipeline import runner as rc_runner
        from researchclaw.pipeline.executor import StageResult
        from researchclaw.pipeline.stages import Stage, StageStatus

        run_dir = tmp_path / "run-critical"
        run_dir.mkdir()
        config = self._make_rc_config(tmp_path)
        adapters = AdapterBundle()

        def fake_execute_stage(stage: Stage, **kwargs) -> StageResult:
            _ = kwargs
            # The paper draft is a critical stage: its failure must abort the run.
            if stage is Stage.PAPER_DRAFT:
                return StageResult(
                    stage=stage,
                    status=StageStatus.FAILED,
                    artifacts=(),
                    error="draft error",
                )
            return StageResult(stage=stage, status=StageStatus.DONE, artifacts=("ok.md",))

        with patch.object(rc_runner, "execute_stage", side_effect=fake_execute_stage):
            results = rc_runner.execute_pipeline(
                run_dir=run_dir,
                run_id="run-fail-critical",
                config=config,
                adapters=adapters,
                skip_noncritical=True,
            )
        assert results[-1].stage is Stage.PAPER_DRAFT
        assert results[-1].status is StageStatus.FAILED
================================================
FILE: tests/test_rc_evolution.py
================================================
# pyright: reportPrivateUsage=false
"""Tests for the evolution (self-learning) system."""
from __future__ import annotations
import json
from datetime import datetime, timezone, timedelta
from pathlib import Path
import pytest
from researchclaw.evolution import (
EvolutionStore,
LessonCategory,
LessonEntry,
extract_lessons,
_classify_error,
_time_weight,
)
# ── LessonEntry tests ──
class TestLessonEntry:
    """Serialization round-trips and defaulting for LessonEntry."""

    def test_to_dict_and_from_dict_roundtrip(self) -> None:
        original = LessonEntry(
            stage_name="hypothesis_gen",
            stage_num=8,
            category="experiment",
            severity="error",
            description="Code validation failed",
            timestamp="2026-03-10T12:00:00+00:00",
            run_id="run-1",
        )
        restored = LessonEntry.from_dict(original.to_dict())
        assert restored.stage_name == "hypothesis_gen"
        assert restored.stage_num == 8
        assert restored.category == "experiment"
        assert restored.severity == "error"

    def test_from_dict_handles_missing_fields(self) -> None:
        blank = LessonEntry.from_dict({})
        assert blank.stage_name == ""
        assert blank.stage_num == 0
        assert blank.category == "pipeline"
# ── Classification tests ──
class TestClassifyError:
    """Stage-name / message routing in _classify_error."""

    def test_timeout_classified_as_system(self) -> None:
        category = _classify_error("experiment_run", "Connection timeout after 30s")
        assert category == "system"

    def test_validation_classified_as_experiment(self) -> None:
        category = _classify_error("code_generation", "Syntax error in code")
        assert category == "experiment"

    def test_citation_classified_as_literature(self) -> None:
        category = _classify_error("citation_verify", "Hallucinated reference")
        assert category == "literature"

    def test_paper_classified_as_writing(self) -> None:
        category = _classify_error("paper_draft", "Draft quality too low")
        assert category == "writing"

    def test_unknown_defaults_to_pipeline(self) -> None:
        category = _classify_error("unknown_stage", "something random")
        assert category == "pipeline"
# ── Time weight tests ──
class TestTimeWeight:
    """Age-based weighting of lessons via _time_weight."""

    def test_recent_lesson_has_high_weight(self) -> None:
        fresh = datetime.now(timezone.utc).isoformat(timespec="seconds")
        assert _time_weight(fresh) > 0.9

    def test_30_day_old_has_half_weight(self) -> None:
        ts = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat(timespec="seconds")
        # Expected to land near ~0.5 at 30 days of age.
        assert 0.4 < _time_weight(ts) < 0.6

    def test_90_day_old_returns_zero(self) -> None:
        ts = (datetime.now(timezone.utc) - timedelta(days=91)).isoformat(timespec="seconds")
        assert _time_weight(ts) == 0.0

    def test_invalid_timestamp_returns_zero(self) -> None:
        assert _time_weight("not-a-date") == 0.0

    def test_empty_timestamp_returns_zero(self) -> None:
        assert _time_weight("") == 0.0
# ── Extract lessons tests ──
class TestExtractLessons:
    """Lesson extraction from stage results and per-run artifact files."""

    def _make_result(self, stage_num, status, error=None, decision="proceed"):
        """Build a minimal stage-result stand-in via SimpleNamespace."""
        from types import SimpleNamespace
        from researchclaw.pipeline.stages import Stage, StageStatus

        return SimpleNamespace(
            stage=Stage(stage_num),
            status=StageStatus(status),
            error=error,
            decision=decision,
        )

    def test_extracts_lesson_from_failed_stage(self) -> None:
        lessons = extract_lessons(
            [self._make_result(4, "failed", error="API rate limited")],
            run_id="test-run",
        )
        assert len(lessons) == 1
        assert lessons[0].severity == "error"
        assert "rate limited" in lessons[0].description

    def test_extracts_lesson_from_blocked_stage(self) -> None:
        lessons = extract_lessons(
            [self._make_result(5, "blocked_approval")], run_id="test-run"
        )
        assert len(lessons) == 1
        assert lessons[0].severity == "warning"
        assert "blocked" in lessons[0].description

    def test_extracts_lesson_from_pivot_decision(self) -> None:
        lessons = extract_lessons(
            [self._make_result(15, "done", decision="pivot")], run_id="test-run"
        )
        assert len(lessons) == 1
        assert "PIVOT" in lessons[0].description

    def test_no_lessons_from_successful_proceed(self) -> None:
        lessons = extract_lessons([self._make_result(1, "done", decision="proceed")])
        assert len(lessons) == 0

    def test_multiple_results_multiple_lessons(self) -> None:
        outcomes = [
            self._make_result(4, "failed", error="timeout"),
            self._make_result(5, "blocked_approval"),
            self._make_result(15, "done", decision="refine"),
        ]
        assert len(extract_lessons(outcomes)) == 3

    def test_extracts_decision_rationale(self, tmp_path: Path) -> None:
        run_dir = tmp_path / "run"
        stage_dir = run_dir / "stage-15"
        stage_dir.mkdir(parents=True)
        (stage_dir / "decision_structured.json").write_text(
            json.dumps({"decision": "pivot", "rationale": "NaN in metrics"}),
            encoding="utf-8",
        )
        lessons = extract_lessons(
            [self._make_result(15, "done", decision="pivot")],
            run_id="test",
            run_dir=run_dir,
        )
        assert any("NaN in metrics" in lesson.description for lesson in lessons)

    def test_extracts_rationale_from_raw_text_excerpt(self, tmp_path: Path) -> None:
        run_dir = tmp_path / "run"
        stage_dir = run_dir / "stage-15"
        stage_dir.mkdir(parents=True)
        (stage_dir / "decision_structured.json").write_text(
            json.dumps({
                "decision": "refine",
                "raw_text_excerpt": (
                    "## Decision\n**REFINE**\n\n"
                    "## Justification\n"
                    "The analysis provides promising evidence but lacks statistical rigor."
                ),
                "generated": "2026-03-11T05:15:43+00:00",
            }),
            encoding="utf-8",
        )
        lessons = extract_lessons(
            [self._make_result(15, "done", decision="refine")],
            run_id="test",
            run_dir=run_dir,
        )
        assert any("statistical rigor" in lesson.description for lesson in lessons)

    def test_extracts_stderr_runtime_lesson(self, tmp_path: Path) -> None:
        run_dir = tmp_path / "run"
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "metrics": {"loss": 0.5},
                "stderr": "RuntimeWarning: invalid value encountered in divide",
            }),
            encoding="utf-8",
        )
        lessons = extract_lessons([self._make_result(12, "done")], run_dir=run_dir)
        assert any("RuntimeWarning" in lesson.description for lesson in lessons)

    def test_extracts_nan_metric_lesson(self, tmp_path: Path) -> None:
        run_dir = tmp_path / "run"
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({"metrics": {"accuracy": "nan"}}),
            encoding="utf-8",
        )
        lessons = extract_lessons([self._make_result(12, "done")], run_dir=run_dir)
        assert any(
            "accuracy" in lesson.description and "nan" in lesson.description.lower()
            for lesson in lessons
        )

    def test_no_runtime_lessons_without_run_dir(self) -> None:
        lessons = extract_lessons([self._make_result(12, "done")])
        assert len(lessons) == 0
# ── EvolutionStore tests ──
class TestEvolutionStore:
    """Persistence, querying, and overlay building for EvolutionStore."""

    def test_append_and_load(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        store.append(
            LessonEntry(
                stage_name="hypothesis_gen",
                stage_num=8,
                category="pipeline",
                severity="warning",
                description="PIVOT triggered",
                timestamp=datetime.now(timezone.utc).isoformat(timespec="seconds"),
            )
        )
        loaded = store.load_all()
        assert len(loaded) == 1
        assert loaded[0].stage_name == "hypothesis_gen"

    def test_append_many(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        batch = [
            LessonEntry("s1", 1, "system", "error", "err1",
                        datetime.now(timezone.utc).isoformat()),
            LessonEntry("s2", 2, "pipeline", "info", "info1",
                        datetime.now(timezone.utc).isoformat()),
        ]
        store.append_many(batch)
        assert store.count() == 2

    def test_append_many_empty_is_noop(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        store.append_many([])
        assert store.count() == 0

    def test_load_all_empty_store(self, tmp_path: Path) -> None:
        assert EvolutionStore(tmp_path / "evo").load_all() == []

    def test_query_for_stage_returns_relevant_lessons(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        now = datetime.now(timezone.utc).isoformat(timespec="seconds")
        store.append(LessonEntry("hypothesis_gen", 8, "pipeline", "error",
                                 "Failed hypothesis", now))
        store.append(LessonEntry("paper_draft", 17, "writing", "warning",
                                 "Draft too short", now))
        matched = store.query_for_stage("hypothesis_gen", max_lessons=5)
        # The same-stage lesson should be boosted to the front.
        assert len(matched) >= 1
        assert matched[0].stage_name == "hypothesis_gen"

    def test_query_respects_max_lessons(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        now = datetime.now(timezone.utc).isoformat(timespec="seconds")
        for i in range(10):
            store.append(LessonEntry("stage_1", 1, "system", "error",
                                     f"Error {i}", now))
        assert len(store.query_for_stage("stage_1", max_lessons=3)) == 3

    def test_build_overlay_returns_empty_for_no_lessons(self, tmp_path: Path) -> None:
        assert EvolutionStore(tmp_path / "evo").build_overlay("hypothesis_gen") == ""

    def test_build_overlay_returns_formatted_text(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        now = datetime.now(timezone.utc).isoformat(timespec="seconds")
        store.append(LessonEntry("hypothesis_gen", 8, "experiment", "error",
                                 "Code syntax error in experiment", now))
        overlay = store.build_overlay("hypothesis_gen")
        assert "Lessons from Prior Runs" in overlay
        assert "Code syntax error" in overlay
        assert "❌" in overlay

    def test_old_lessons_filtered_by_time_weight(self, tmp_path: Path) -> None:
        store = EvolutionStore(tmp_path / "evo")
        old_ts = (datetime.now(timezone.utc) - timedelta(days=100)).isoformat()
        store.append(LessonEntry("stage_1", 1, "system", "error", "Old error", old_ts))
        # Lessons older than 90 days carry zero weight and are dropped.
        assert len(store.query_for_stage("stage_1")) == 0

    def test_creates_directory_if_not_exists(self, tmp_path: Path) -> None:
        store_dir = tmp_path / "nested" / "evo"
        _ = EvolutionStore(store_dir)
        assert store_dir.exists()
# ── PromptManager evolution overlay integration ──
class TestPromptManagerEvolutionOverlay:
    """Evolution overlays are appended to the user prompt only when non-empty."""

    def test_overlay_appended_to_user_prompt(self) -> None:
        from researchclaw.prompts import PromptManager

        pm = PromptManager()
        overlay = "## Lessons\n1. Avoid timeout errors."
        sp = pm.for_stage(
            "topic_init",
            evolution_overlay=overlay,
            topic="test",
            domains="ml",
            project_name="p1",
            quality_threshold="8.0",
        )
        assert "Avoid timeout errors" in sp.user

    def test_no_overlay_when_empty(self) -> None:
        from researchclaw.prompts import PromptManager

        pm = PromptManager()
        stage_kwargs = {
            "topic": "test",
            "domains": "ml",
            "project_name": "p1",
            "quality_threshold": "8.0",
        }
        without_overlay = pm.for_stage("topic_init", **stage_kwargs)
        with_empty_overlay = pm.for_stage(
            "topic_init", evolution_overlay="", **stage_kwargs
        )
        assert without_overlay.user == with_empty_overlay.user
================================================
FILE: tests/test_rc_executor.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
import pytest
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.pipeline import executor as rc_executor
from researchclaw.pipeline.stages import Stage, StageStatus
class FakeLLMClient:
    """Test double that records every chat() call and replies with canned text."""

    def __init__(self, response_text: str = "mock response"):
        self.response_text: str = response_text
        # One entry per chat() invocation: the full message list passed in.
        self.calls: list[list[dict[str, str]]] = []

    def chat(self, messages: list[dict[str, str]], **kwargs: object):
        """Record the messages and return a canned LLMResponse."""
        _ = kwargs
        self.calls.append(messages)
        from researchclaw.llm.client import LLMResponse

        return LLMResponse(content=self.response_text, model="fake-model")
class FakeLLMClientWithConfig(FakeLLMClient):
    """FakeLLMClient variant that also exposes a ``.config`` attribute."""

    def __init__(self, response_text: str = "mock response"):
        super().__init__(response_text=response_text)
        self.config: SimpleNamespace = SimpleNamespace(
            base_url="http://fake", api_key="fake-key"
        )
@pytest.fixture()
def rc_config(tmp_path: Path) -> RCConfig:
    """Full RCConfig built from an in-memory dict — no files, no path checks."""
    payload = {
        "project": {"name": "rc-test", "mode": "docs-first"},
        "research": {
            "topic": "test-driven science",
            "domains": ["ml", "systems"],
            "daily_paper_count": 2,
            "quality_threshold": 8.2,
        },
        "runtime": {"timezone": "UTC"},
        "notifications": {
            "channel": "local",
            "on_stage_start": True,
            "on_stage_fail": False,
            "on_gate_required": True,
        },
        "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
        "openclaw_bridge": {"use_memory": True, "use_message": True},
        "llm": {
            "provider": "openai-compatible",
            "base_url": "http://localhost:1234/v1",
            "api_key_env": "RC_TEST_KEY",
            "api_key": "inline-test-key",
            "primary_model": "fake-model",
            "fallback_models": [],
        },
        "security": {"hitl_required_stages": [5, 9, 20]},
        "experiment": {"mode": "sandbox"},
    }
    return RCConfig.from_dict(payload, project_root=tmp_path, check_paths=False)
@pytest.fixture()
def adapters() -> AdapterBundle:
    """Fresh adapter bundle for each test."""
    return AdapterBundle()


@pytest.fixture()
def run_dir(tmp_path: Path) -> Path:
    """Create and return an empty run directory under tmp_path."""
    target = tmp_path / "run"
    target.mkdir()
    return target
def _write_prior_artifact(
run_dir: Path, stage_num: int, filename: str, content: str
) -> None:
stage_dir = run_dir / f"stage-{stage_num:02d}"
stage_dir.mkdir(parents=True, exist_ok=True)
(stage_dir / filename).write_text(content, encoding="utf-8")
def test_executor_map_has_23_entries() -> None:
    """The pipeline registers exactly 23 stage executors."""
    mapping = getattr(rc_executor, "EXECUTOR_MAP", rc_executor._STAGE_EXECUTORS)
    assert len(mapping) == 23


def test_every_stage_member_has_matching_executor() -> None:
    """Every Stage enum member has a registered executor — no more, no fewer."""
    mapping = getattr(rc_executor, "EXECUTOR_MAP", rc_executor._STAGE_EXECUTORS)
    assert set(mapping.keys()) == set(Stage)


def test_stage_result_dataclass_fields() -> None:
    """StageResult defaults: error=None, decision='proceed', empty evidence refs."""
    result = rc_executor.StageResult(
        stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("goal.md",)
    )
    assert result.stage == Stage.TOPIC_INIT
    assert result.status == StageStatus.DONE
    assert result.artifacts == ("goal.md",)
    assert result.error is None
    assert result.decision == "proceed"
    assert result.evidence_refs == ()


def test_utcnow_iso_returns_valid_iso_timestamp() -> None:
    """_utcnow_iso() yields an ISO-8601 timestamp with an explicit UTC offset."""
    ts = rc_executor._utcnow_iso()
    assert ts.endswith("+00:00")
    assert "T" in ts
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("before\n```yaml\na: 1\n```\nafter", "a: 1"),
        ("```yml\nkey: value\n```", "key: value"),
        ("```\nplain: true\n```", "plain: true"),
        (" x: y ", "x: y"),
    ],
)
def test_extract_yaml_block_variants(text: str, expected: str) -> None:
    """_extract_yaml_block handles ```yaml/```yml/bare fences and strips whitespace."""
    assert rc_executor._extract_yaml_block(text) == expected
@pytest.mark.parametrize(
    ("payload", "default", "expected"),
    [
        ('{"ok": true}', {"fallback": True}, {"ok": True}),
        ("[1, 2, 3]", {"fallback": True}, [1, 2, 3]),
        ("not-json", {"fallback": True}, {"fallback": True}),
    ],
)
def test_safe_json_loads_valid_and_invalid(payload: str, default, expected) -> None:
    """_safe_json_loads parses valid JSON (objects and arrays) and returns the
    supplied default for unparseable input instead of raising."""
    assert rc_executor._safe_json_loads(payload, default) == expected
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("a/b", "a_b"),
        ("a\\b", "a_b"),
        ("../secret", "__secret"),
        ("name with spaces!.md", "name_with_spaces_.md"),
        ("", "unnamed"),
    ],
)
def test_safe_filename_sanitization(raw: str, expected: str) -> None:
    """_safe_filename replaces path separators and unsafe characters with '_'
    (blocking traversal like '../') and maps the empty name to 'unnamed'."""
    assert rc_executor._safe_filename(raw) == expected
def test_safe_filename_truncates_to_100_chars() -> None:
    """Names longer than 100 characters are cut down to exactly 100."""
    cleaned = rc_executor._safe_filename("x" * 120)
    assert cleaned == "x" * 100
    assert len(cleaned) == 100
def test_build_context_preamble_basic_fields(
    rc_config: RCConfig, run_dir: Path
) -> None:
    """The preamble always carries the heading, the topic, and the domain list."""
    preamble = rc_executor._build_context_preamble(rc_config, run_dir)
    for fragment in ("## Research Context", "test-driven science", "ml, systems"):
        assert fragment in preamble
def test_build_context_preamble_includes_selected_prior_artifacts(
    rc_config: RCConfig, run_dir: Path
) -> None:
    """Opt-in include_* flags pull prior goal/hypotheses/synthesis artifacts
    into the preamble, each under its own markdown sub-heading."""
    _write_prior_artifact(run_dir, 1, "goal.md", "goal content")
    _write_prior_artifact(run_dir, 8, "hypotheses.md", "hyp content")
    _write_prior_artifact(run_dir, 7, "synthesis.md", "synth content")
    text = rc_executor._build_context_preamble(
        rc_config,
        run_dir,
        include_goal=True,
        include_hypotheses=True,
        include_synthesis=True,
    )
    assert "### Goal" in text
    assert "goal content" in text
    assert "### Hypotheses" in text
    assert "hyp content" in text
    assert "### Synthesis" in text
    assert "synth content" in text
def test_read_prior_artifact_finds_newest_file(run_dir: Path) -> None:
    """When the same artifact exists in several stages, the later-written
    (higher-stage) copy wins — both orderings coincide here."""
    _write_prior_artifact(run_dir, 1, "goal.md", "old")
    _write_prior_artifact(run_dir, 3, "goal.md", "new")
    assert rc_executor._read_prior_artifact(run_dir, "goal.md") == "new"
def test_read_prior_artifact_finds_directory_path(run_dir: Path) -> None:
    """A trailing-slash artifact name resolves to the directory path string."""
    cards_dir = run_dir / "stage-06" / "cards"
    cards_dir.mkdir(parents=True)
    (cards_dir / "card-1.json").write_text("{}", encoding="utf-8")
    resolved = rc_executor._read_prior_artifact(run_dir, "cards/")
    assert resolved == str(cards_dir)
def test_read_prior_artifact_returns_none_when_not_found(run_dir: Path) -> None:
    """An absent artifact yields None rather than raising."""
    missing = rc_executor._read_prior_artifact(run_dir, "missing.md")
    assert missing is None
def test_read_best_analysis_prefers_best_file(run_dir: Path) -> None:
    """BUG-225: _read_best_analysis prefers analysis_best.md at run root."""
    from researchclaw.pipeline._helpers import _read_best_analysis

    # Create degenerate analysis in stage-14 and best at run root; the
    # run-root copy must shadow the per-stage one.
    s14 = run_dir / "stage-14"
    s14.mkdir(parents=True)
    (s14 / "analysis.md").write_text("Degenerate analysis", encoding="utf-8")
    (run_dir / "analysis_best.md").write_text("Best analysis", encoding="utf-8")
    result = _read_best_analysis(run_dir)
    assert result == "Best analysis"
def test_read_best_analysis_falls_back_to_prior_artifact(run_dir: Path) -> None:
    """BUG-225: Falls back to _read_prior_artifact when no analysis_best.md."""
    from researchclaw.pipeline._helpers import _read_best_analysis

    # Only a per-stage analysis exists; it should be returned as-is.
    s14 = run_dir / "stage-14"
    s14.mkdir(parents=True)
    (s14 / "analysis.md").write_text("Only analysis", encoding="utf-8")
    result = _read_best_analysis(run_dir)
    assert result == "Only analysis"
def test_read_best_analysis_returns_empty_when_none(run_dir: Path) -> None:
    """BUG-225: Returns empty string when no analysis exists at all."""
    from researchclaw.pipeline._helpers import _read_best_analysis

    assert _read_best_analysis(run_dir) == ""
def test_write_stage_meta_writes_expected_json(run_dir: Path) -> None:
    """_write_stage_meta serializes a StageResult into decision.json with
    stage/run identifiers, artifact lists, the next stage number, and a
    timestamp."""
    stage_dir = run_dir / "stage-01"
    stage_dir.mkdir()
    result = rc_executor.StageResult(
        stage=Stage.TOPIC_INIT,
        status=StageStatus.DONE,
        artifacts=("goal.md",),
        decision="proceed",
        evidence_refs=("stage-01/goal.md",),
    )
    rc_executor._write_stage_meta(stage_dir, Stage.TOPIC_INIT, "run-abc", result)
    payload = cast(
        dict[str, Any],
        json.loads((stage_dir / "decision.json").read_text(encoding="utf-8")),
    )
    assert payload["stage_id"] == "01-topic_init"
    assert payload["run_id"] == "run-abc"
    assert payload["status"] == "done"
    assert payload["decision"] == "proceed"
    # Tuples are serialized as JSON arrays.
    assert payload["output_artifacts"] == ["goal.md"]
    assert payload["evidence_refs"] == ["stage-01/goal.md"]
    # Stage 1 points at stage 2 as the follow-up.
    assert payload["next_stage"] == 2
    # Timestamp must at least start like an ISO-8601 date-time.
    assert re.match(r"\d{4}-\d{2}-\d{2}T", payload["ts"])
def test_execute_stage_creates_stage_dir_writes_artifacts_and_meta(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """Happy path for stage 1: the stage directory is created and goal.md,
    hardware_profile.json, and decision.json are written."""
    fake_llm = FakeLLMClientWithConfig("# Goal\n\nMocked goal body")
    # Substitute the fake client for the real LLM factory.
    monkeypatch.setattr(
        "researchclaw.pipeline.executor.LLMClient.from_rc_config",
        lambda _config: fake_llm,
    )
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-1",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.DONE
    assert "goal.md" in result.artifacts
    assert "hardware_profile.json" in result.artifacts
    assert (run_dir / "stage-01").is_dir()
    assert (
        (run_dir / "stage-01" / "goal.md")
        .read_text(encoding="utf-8")
        .startswith("# Goal")
    )
    assert (run_dir / "stage-01" / "hardware_profile.json").exists()
    # Exactly one LLM round-trip is made for the goal document.
    assert len(fake_llm.calls) == 1
    decision = cast(
        dict[str, Any],
        json.loads(
            (run_dir / "stage-01" / "decision.json").read_text(encoding="utf-8")
        ),
    )
    assert decision["run_id"] == "run-1"
    assert decision["status"] == "done"
    assert decision["output_artifacts"] == ["goal.md", "hardware_profile.json"]
def test_execute_stage_contract_validation_missing_output_file_marks_failed(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """An executor that claims DONE without actually writing its declared
    output file must be downgraded to FAILED by contract validation."""
    def bad_executor(
        _stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
    ):
        # Reports "goal.md" as an artifact but never writes it.
        _ = llm
        return rc_executor.StageResult(
            stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("goal.md",)
        )

    monkeypatch.setitem(rc_executor._STAGE_EXECUTORS, Stage.TOPIC_INIT, bad_executor)
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-2",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert "Missing or empty output: goal.md" in (result.error or "")
def test_execute_stage_contract_validation_missing_output_directory_marks_failed(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """Contract validation also covers directory artifacts (trailing slash):
    declaring "cards/" without creating the directory yields FAILED."""
    # Satisfy stage 6's required input so the executor itself is reached.
    _write_prior_artifact(run_dir, 5, "shortlist.jsonl", '{"title": "x"}')

    def bad_executor(
        _stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
    ):
        # Declares the "cards/" directory but never creates it.
        _ = llm
        return rc_executor.StageResult(
            stage=Stage.KNOWLEDGE_EXTRACT,
            status=StageStatus.DONE,
            artifacts=("cards/",),
        )

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.KNOWLEDGE_EXTRACT, bad_executor
    )
    result = rc_executor.execute_stage(
        Stage.KNOWLEDGE_EXTRACT,
        run_dir=run_dir,
        run_id="run-3",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert "Missing output directory: cards/" in (result.error or "")
def test_execute_stage_missing_required_input_returns_failed(
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """Stage 2 requires goal.md from stage 1; without it the stage fails fast."""
    outcome = rc_executor.execute_stage(
        Stage.PROBLEM_DECOMPOSE,
        run_dir=run_dir,
        run_id="run-4",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert outcome.status == StageStatus.FAILED
    assert "Missing input: goal.md" in (outcome.error or "")
def test_execute_stage_gate_behavior_auto_approve_true_keeps_done(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """Stage 5 is a HITL gate (see rc_config security settings). With
    auto_approve_gates=True the stage stays DONE and the auto-approval is
    recorded in the memory adapter under the "gates" namespace."""
    _write_prior_artifact(run_dir, 4, "candidates.jsonl", '{"title": "paper"}')

    def good_executor(
        stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
        **_kwargs: object,
    ):
        # Writes its declared output so contract validation passes.
        _ = llm
        (stage_dir / "shortlist.jsonl").write_text(
            '{"title": "paper"}\n', encoding="utf-8"
        )
        return rc_executor.StageResult(
            stage=Stage.LITERATURE_SCREEN,
            status=StageStatus.DONE,
            artifacts=("shortlist.jsonl",),
        )

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.LITERATURE_SCREEN, good_executor
    )
    result = rc_executor.execute_stage(
        Stage.LITERATURE_SCREEN,
        run_dir=run_dir,
        run_id="run-5",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.DONE
    memory_entries = getattr(adapters.memory, "entries", [])
    assert any(
        ns == "gates" and "auto-approved" in content for ns, content in memory_entries
    )
def test_execute_stage_gate_behavior_auto_approve_false_blocks(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """With auto_approve_gates=False a gated stage ends BLOCKED_APPROVAL with
    decision "block" and an approval-request message is sent."""
    _write_prior_artifact(run_dir, 4, "candidates.jsonl", '{"title": "paper"}')

    def good_executor(
        stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
        **_kwargs: object,
    ):
        # Same successful executor as the auto-approve test; only the gate
        # handling differs.
        _ = llm
        (stage_dir / "shortlist.jsonl").write_text(
            '{"title": "paper"}\n', encoding="utf-8"
        )
        return rc_executor.StageResult(
            stage=Stage.LITERATURE_SCREEN,
            status=StageStatus.DONE,
            artifacts=("shortlist.jsonl",),
        )

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.LITERATURE_SCREEN, good_executor
    )
    result = rc_executor.execute_stage(
        Stage.LITERATURE_SCREEN,
        run_dir=run_dir,
        run_id="run-6",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=False,
    )
    assert result.status == StageStatus.BLOCKED_APPROVAL
    assert result.decision == "block"
    message_calls = getattr(adapters.message, "calls", [])
    assert message_calls
    # The last message sent carries the approval request text.
    assert "Approval required" in message_calls[-1][2]
def test_execute_stage_llm_client_creation_error_falls_back_without_crash(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """If the LLM client cannot be constructed, the stage must still complete
    (DONE) and write goal.md rather than propagate the exception."""
    def boom(_config: RCConfig):
        # Simulate a failure inside the LLM client factory.
        raise RuntimeError("llm init failed")

    monkeypatch.setattr("researchclaw.pipeline.executor.LLMClient.from_rc_config", boom)
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-7",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.DONE
    assert (run_dir / "stage-01" / "goal.md").exists()
def test_execute_stage_executor_exception_returns_failed(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """An exception raised inside a stage executor is caught and surfaces as
    FAILED with decision "retry" and the exception text in the error."""
    def raising_executor(
        _stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
        **_kwargs: object,
    ):
        _ = llm
        raise RuntimeError("stage exploded")

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.TOPIC_INIT, raising_executor
    )
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-8",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert result.decision == "retry"
    assert "stage exploded" in (result.error or "")
@pytest.mark.parametrize(
    "stage",
    [
        Stage.TOPIC_INIT,
        Stage.PROBLEM_DECOMPOSE,
        Stage.SEARCH_STRATEGY,
        Stage.LITERATURE_COLLECT,
        Stage.LITERATURE_SCREEN,
        Stage.KNOWLEDGE_EXTRACT,
        Stage.SYNTHESIS,
        Stage.HYPOTHESIS_GEN,
        Stage.EXPERIMENT_DESIGN,
        Stage.CODE_GENERATION,
    ],
)
def test_stage_executor_mapping_values_are_callable(stage: Stage) -> None:
    """Each entry in the executor map (spot-checked over the first ten
    stages) must be a callable."""
    assert callable(rc_executor._STAGE_EXECUTORS[stage])
class TestStageHealth:
    """Tests for the stage_health.json telemetry written by execute_stage."""

    def test_stage_health_json_written(self, tmp_path: Path) -> None:
        """Running stage 1 produces stage-01/stage_health.json."""
        from researchclaw.pipeline.executor import execute_stage
        from researchclaw.pipeline.stages import Stage

        # Uses the repository's example config rather than the fixture.
        config = RCConfig.load(
            Path(__file__).parent.parent / "config.researchclaw.example.yaml",
            check_paths=False,
        )
        result = execute_stage(
            Stage.TOPIC_INIT,
            run_dir=tmp_path,
            run_id="test-health",
            config=config,
            adapters=AdapterBundle(),
            auto_approve_gates=True,
        )
        health_path = tmp_path / "stage-01" / "stage_health.json"
        assert result is not None
        assert health_path.exists()

    def test_stage_health_has_required_fields(self, tmp_path: Path) -> None:
        """stage_health.json carries id/run/duration/status/timestamp fields."""
        from unittest.mock import MagicMock, patch
        from researchclaw.pipeline.executor import execute_stage
        from researchclaw.pipeline.stages import Stage

        config = RCConfig.load(
            Path(__file__).parent.parent / "config.researchclaw.example.yaml",
            check_paths=False,
        )
        # Patch the whole LLMClient class so from_rc_config returns a mock.
        with patch("researchclaw.pipeline.executor.LLMClient") as mock_llm_cls:
            mock_client = MagicMock()
            mock_client.chat.return_value = MagicMock(
                content='{"topic": "test", "research_questions": ["q1"]}'
            )
            mock_llm_cls.from_rc_config.return_value = mock_client
            execute_stage(
                Stage.TOPIC_INIT,
                run_dir=tmp_path,
                run_id="test-health-fields",
                config=config,
                adapters=AdapterBundle(),
                auto_approve_gates=True,
            )
        health_path = tmp_path / "stage-01" / "stage_health.json"
        # Field checks only run if the health file was produced.
        if health_path.exists():
            data = json.loads(health_path.read_text(encoding="utf-8"))
            assert "stage_id" in data
            assert "run_id" in data
            assert "duration_sec" in data
            assert "status" in data
            assert "timestamp" in data
            assert data["duration_sec"] >= 0

    def test_stage_health_duration_positive(self, tmp_path: Path) -> None:
        """Recorded duration_sec is never negative."""
        from unittest.mock import MagicMock, patch
        from researchclaw.pipeline.executor import execute_stage
        from researchclaw.pipeline.stages import Stage

        config = RCConfig.load(
            Path(__file__).parent.parent / "config.researchclaw.example.yaml",
            check_paths=False,
        )
        with patch("researchclaw.pipeline.executor.LLMClient") as mock_llm_cls:
            mock_client = MagicMock()
            mock_client.chat.return_value = MagicMock(
                content='{"topic": "test", "sub_problems": []}'
            )
            mock_llm_cls.from_rc_config.return_value = mock_client
            execute_stage(
                Stage.TOPIC_INIT,
                run_dir=tmp_path,
                run_id="test-duration",
                config=config,
                adapters=AdapterBundle(),
                auto_approve_gates=True,
            )
        health_path = tmp_path / "stage-01" / "stage_health.json"
        if health_path.exists():
            data = json.loads(health_path.read_text(encoding="utf-8"))
            assert data["duration_sec"] >= 0
# Contracts import for Stage 13/22 preservation features.
from researchclaw.pipeline.contracts import CONTRACTS
class TestIterativeRefine:
    """Tests for _execute_iterative_refine() (Stage 13).

    Fix: the ~40-line sandbox config dict was duplicated verbatim in
    test_refine_converges_after_no_improvement and
    test_refine_sandbox_mode_runs_code; it now lives in _sandbox_config().
    """

    def _prepare_refine_inputs(self, run_dir: Path) -> None:
        """Seed the stage-10 experiment code and a stage-12 run record that
        stage 13 consumes."""
        _write_prior_artifact(
            run_dir,
            10,
            "experiment.py",
            (
                "import random\n"
                "random.seed(42)\n"
                "for i in range(5):\n"
                " print(f'val_loss: {0.5 - i*0.05:.4f}')\n"
            ),
        )
        # Nested artifact path: the parent dir must exist beforehand.
        (run_dir / "stage-12" / "runs").mkdir(parents=True, exist_ok=True)
        _write_prior_artifact(
            run_dir,
            12,
            "runs/run-1.json",
            json.dumps(
                {
                    "run_id": "run-1",
                    "status": "completed",
                    "metrics": {"val_loss": 0.35},
                }
            ),
        )

    def _sandbox_config(self, tmp_path: Path) -> RCConfig:
        """Build an RCConfig whose experiment section runs a real sandbox
        (shared by the convergence and sandbox-execution tests)."""
        import sys

        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {
                "topic": "test-driven science",
                "domains": ["ml", "systems"],
                "daily_paper_count": 2,
                "quality_threshold": 8.2,
            },
            "runtime": {"timezone": "UTC"},
            "notifications": {
                "channel": "local",
                "on_stage_start": True,
                "on_stage_fail": False,
                "on_gate_required": True,
            },
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline-test-key",
                "primary_model": "fake-model",
                "fallback_models": [],
            },
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 3,
                "metric_key": "val_loss",
                "metric_direction": "minimize",
                "sandbox": {
                    # Use the running interpreter so the sandbox works anywhere.
                    "python_path": sys.executable,
                    "gpu_required": False,
                    "max_memory_mb": 1024,
                },
            },
        }
        return RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)

    def test_refine_simulated_mode_skips(
        self,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """R10-Fix3: Simulated mode should skip iterative refinement entirely."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Force simulated mode to test the skip behavior
        import copy

        sim_cfg = copy.deepcopy(rc_config)
        object.__setattr__(sim_cfg.experiment, "mode", "simulated")
        result = rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            sim_cfg,
            adapters,
            llm=None,
        )
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert payload["skipped"] is True
        assert payload["mode"] == "simulated"
        assert result.status == StageStatus.DONE
        # Original code should be copied as final
        assert (stage_dir / "experiment_final.py").exists()

    def test_refine_no_llm_saves_original_as_final(
        self,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """Without an LLM the original code is promoted to experiment_final.py
        and the log records llm_unavailable."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        result = rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            rc_config,
            adapters,
            llm=None,
        )
        original_code = (run_dir / "stage-10" / "experiment.py").read_text(
            encoding="utf-8"
        )
        final_code = (stage_dir / "experiment_final.py").read_text(encoding="utf-8")
        assert original_code == final_code
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert payload["stop_reason"] == "llm_unavailable"
        assert result.status == StageStatus.DONE

    def test_refine_with_llm_generates_improved_code(
        self,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """With an LLM, versioned experiment dirs and a final script appear,
        and the log records at least one iteration."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        llm = FakeLLMClient(
            "```python\n"
            "import random\n"
            "random.seed(42)\n"
            "for i in range(10):\n"
            " print(f'val_loss: {0.4 - i*0.03:.4f}')\n"
            "```"
        )
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, rc_config, adapters, llm=llm
        )
        assert (stage_dir / "experiment_v1").is_dir()
        assert (stage_dir / "experiment_final.py").exists()
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert isinstance(payload.get("iterations"), list)
        assert payload["iterations"]

    def test_refine_converges_after_no_improvement(
        self,
        tmp_path: Path,
        run_dir: Path,
        adapters: AdapterBundle,
    ) -> None:
        """Two consecutive non-improving iterations trigger convergence."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        sandbox_config = self._sandbox_config(tmp_path)
        # The fake LLM always emits code with a constant val_loss, so the
        # metric never improves.
        llm = FakeLLMClient(
            "```python\nfor _ in range(3):\n print('val_loss: 0.5000')\n```"
        )
        rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            sandbox_config,
            adapters,
            llm=llm,
        )
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert payload["converged"] is True
        assert payload["stop_reason"] == "no_improvement_for_2_iterations"

    def test_refine_artifacts_include_version_files(
        self,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """The StageResult lists the log, the final dir, and versioned dirs."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        llm = FakeLLMClient(
            "```python\n"
            "import random\n"
            "random.seed(42)\n"
            "for i in range(10):\n"
            " print(f'val_loss: {0.4 - i*0.03:.4f}')\n"
            "```"
        )
        result = rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            rc_config,
            adapters,
            llm=llm,
        )
        assert "refinement_log.json" in result.artifacts
        assert "experiment_final/" in result.artifacts
        assert any(
            artifact.startswith("experiment_v") and artifact.endswith("/")
            for artifact in result.artifacts
        )

    def test_refine_sandbox_mode_runs_code(
        self,
        tmp_path: Path,
        run_dir: Path,
        adapters: AdapterBundle,
    ) -> None:
        """In sandbox mode the candidate code is executed and every logged
        iteration carries sandbox results."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        sandbox_config = self._sandbox_config(tmp_path)
        llm = FakeLLMClient(
            "```python\n"
            "import random\n"
            "random.seed(42)\n"
            "for i in range(10):\n"
            " print(f'val_loss: {0.4 - i*0.03:.4f}')\n"
            "```"
        )
        rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            sandbox_config,
            adapters,
            llm=llm,
        )
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert any(
            "sandbox" in iteration for iteration in payload.get("iterations", [])
        )
class TestExportPublishCodePackage:
    """Tests for the code/ packaging done by _execute_export_publish() (Stage 22).

    Fix: the stage-19 paper seeding and stage-22 dir creation were duplicated
    in all five tests; they now live in _write_paper()/_make_stage_dir().
    """

    def _write_paper(self, run_dir: Path, title: str = "Test Paper") -> None:
        """Seed the stage-19 revised paper that stage 22 packages."""
        _write_prior_artifact(
            run_dir, 19, "paper_revised.md", f"# {title}\n\nSome content..."
        )

    def _make_stage_dir(self, tmp_path: Path) -> Path:
        """Create and return the stage-22 output directory under tmp_path."""
        stage_dir = tmp_path / "run" / "stage-22"
        stage_dir.mkdir(parents=True, exist_ok=True)
        return stage_dir

    def test_export_packages_experiment_final(
        self,
        tmp_path: Path,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """Stage-13 final code is packaged with a README and requirements."""
        self._write_paper(run_dir)
        _write_prior_artifact(
            run_dir,
            13,
            "experiment_final.py",
            'import numpy\nprint("val_loss: 0.1")\n',
        )
        stage_dir = self._make_stage_dir(tmp_path)
        rc_executor._execute_export_publish(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        assert (stage_dir / "code" / "experiment.py").exists()
        assert (stage_dir / "code" / "README.md").exists()
        req_text = (stage_dir / "code" / "requirements.txt").read_text(encoding="utf-8")
        # The numpy import must be detected as a dependency.
        assert "numpy" in req_text

    def test_export_falls_back_to_experiment_py(
        self,
        tmp_path: Path,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """Without stage-13 output, the stage-10 experiment.py is packaged."""
        self._write_paper(run_dir)
        _write_prior_artifact(
            run_dir,
            10,
            "experiment.py",
            'import numpy\nprint("val_loss: 0.1")\n',
        )
        stage_dir = self._make_stage_dir(tmp_path)
        rc_executor._execute_export_publish(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        code_text = (stage_dir / "code" / "experiment.py").read_text(encoding="utf-8")
        assert "val_loss: 0.1" in code_text

    def test_export_no_experiment_skips_code_dir(
        self,
        tmp_path: Path,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """Without any experiment code the code/ package is omitted entirely."""
        self._write_paper(run_dir)
        stage_dir = self._make_stage_dir(tmp_path)
        result = rc_executor._execute_export_publish(
            stage_dir,
            run_dir,
            rc_config,
            adapters,
            llm=None,
        )
        assert not (stage_dir / "code").exists()
        assert "code/" not in result.artifacts

    def test_export_detects_multiple_dependencies(
        self,
        tmp_path: Path,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """Imports are mapped to pip package names (sklearn -> scikit-learn)."""
        self._write_paper(run_dir)
        _write_prior_artifact(
            run_dir,
            13,
            "experiment_final.py",
            (
                "import numpy\n"
                "import torch\n"
                "from sklearn.metrics import accuracy_score\n"
                "print(accuracy_score([1], [1]))\n"
            ),
        )
        stage_dir = self._make_stage_dir(tmp_path)
        rc_executor._execute_export_publish(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        requirements = (stage_dir / "code" / "requirements.txt").read_text(
            encoding="utf-8"
        )
        assert "numpy" in requirements
        assert "torch" in requirements
        assert "scikit-learn" in requirements

    def test_export_code_readme_contains_title(
        self,
        tmp_path: Path,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """The generated code README carries the paper title."""
        self._write_paper(run_dir, title="My Great Paper")
        _write_prior_artifact(
            run_dir,
            13,
            "experiment_final.py",
            'print("val_loss: 0.1")\n',
        )
        stage_dir = self._make_stage_dir(tmp_path)
        rc_executor._execute_export_publish(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        readme = (stage_dir / "code" / "README.md").read_text(encoding="utf-8")
        assert "My Great Paper" in readme
def test_contracts_stage13_includes_experiment_final() -> None:
    """Stage 13's contract must declare the experiment_final/ directory."""
    outputs = CONTRACTS[Stage.ITERATIVE_REFINE].output_files
    assert "experiment_final/" in outputs
def test_contracts_stage22_includes_code_dir() -> None:
    """Stage 22's contract must declare the code/ directory."""
    outputs = CONTRACTS[Stage.EXPORT_PUBLISH].output_files
    assert "code/" in outputs
# ── P1-1: Topic keyword extraction tests ──
class TestExtractTopicKeywords:
    """Tests for _extract_topic_keywords() (P1-1)."""

    def test_basic_extraction(self) -> None:
        """Content words are lower-cased and kept; stop words are dropped."""
        keywords = rc_executor._extract_topic_keywords(
            "Agent-based Reinforcement Learning for Automated Scientific Discovery"
        )
        assert "agent-based" in keywords
        assert "reinforcement" in keywords
        assert "learning" in keywords
        assert "automated" in keywords
        assert "scientific" in keywords
        assert "discovery" in keywords
        # Stop words excluded
        assert "for" not in keywords

    def test_includes_domain_keywords(self) -> None:
        """Configured domain names are appended to the keyword list."""
        keywords = rc_executor._extract_topic_keywords(
            "Neural network pruning", domains=("ml", "optimization")
        )
        assert "neural" in keywords
        assert "network" in keywords
        assert "pruning" in keywords
        assert "ml" in keywords
        assert "optimization" in keywords

    def test_deduplication(self) -> None:
        """A word appearing in both topic and domains is kept only once."""
        keywords = rc_executor._extract_topic_keywords(
            "Learning to learn meta-learning", domains=("learning",)
        )
        assert keywords.count("learning") == 1

    def test_empty_topic(self) -> None:
        """An empty topic yields an empty keyword list."""
        keywords = rc_executor._extract_topic_keywords("")
        assert keywords == []
# ── P1-2: Topic constraint block test ──
class TestTopicConstraintBlock:
    """Tests for the _topic_constraint_block() prompt scaffolding (P1-2)."""

    def test_contains_topic(self) -> None:
        topic = "Transformer attention for time series"
        assert topic in rc_executor._topic_constraint_block(topic)

    def test_contains_prohibition(self) -> None:
        block = rc_executor._topic_constraint_block("anything")
        lowered = block.lower()
        assert "PROHIBITED" in block
        assert "environment" in lowered
        assert "infrastructure" in lowered

    def test_hard_constraint_markers(self) -> None:
        block = rc_executor._topic_constraint_block("test")
        for marker in ("HARD TOPIC CONSTRAINT", "END CONSTRAINT"):
            assert marker in block
# ── Multi-perspective debate tests ──
class TestParseDecision:
    """Tests for _parse_decision() verdict-keyword extraction."""

    def test_proceed_default(self) -> None:
        # No recognized keyword: default is "proceed".
        assert rc_executor._parse_decision("Some random text") == "proceed"

    def test_proceed_explicit(self) -> None:
        text = "## Decision\nPROCEED\n## Justification\nGood results."
        assert rc_executor._parse_decision(text) == "proceed"

    def test_pivot_detected(self) -> None:
        text = "## Decision\nPIVOT\n## Justification\nHypotheses flawed."
        assert rc_executor._parse_decision(text) == "pivot"

    def test_refine_detected(self) -> None:
        text = "## Decision\nREFINE\n## Justification\nNeed more tuning."
        assert rc_executor._parse_decision(text) == "refine"

    def test_pivot_case_insensitive(self) -> None:
        # Lower-case keywords must also be recognized.
        text = "## Decision\npivot\n## Justification\nBad approach."
        assert rc_executor._parse_decision(text) == "pivot"

    def test_pivot_takes_priority_over_proceed(self) -> None:
        # When both keywords appear, PIVOT outranks PROCEED.
        text = "## Decision\nPIVOT\nWe should not PROCEED."
        assert rc_executor._parse_decision(text) == "pivot"

    def test_decision_in_body_not_heading(self) -> None:
        # The keyword counts even outside a "## Decision" heading.
        text = "The results suggest we should PIVOT to a new approach."
        assert rc_executor._parse_decision(text) == "pivot"
class TestResearchDecisionStructured:
    """Tests for _execute_research_decision() structured output (Stage 15).

    Fix: removed a redundant function-local ``import json`` — the module-level
    ``json`` import is already used throughout this file.
    """

    def test_decision_produces_structured_json(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        """The stage emits decision_structured.json mirroring the verdict."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-15"
        stage_dir.mkdir(parents=True)
        _write_prior_artifact(run_dir, 14, "analysis.md", "# Analysis\nResults ok.")
        fake_llm = FakeLLMClient("## Decision\nPROCEED\n## Justification\nGood.")
        result = rc_executor._execute_research_decision(
            stage_dir, run_dir, rc_config, adapters, llm=fake_llm
        )
        assert result.decision == "proceed"
        assert "decision_structured.json" in result.artifacts
        data = json.loads((stage_dir / "decision_structured.json").read_text())
        assert data["decision"] == "proceed"

    def test_pivot_decision_from_llm(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        """A PIVOT verdict from the LLM propagates to the stage result."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-15"
        stage_dir.mkdir(parents=True)
        _write_prior_artifact(run_dir, 14, "analysis.md", "# Analysis\nBad results.")
        fake_llm = FakeLLMClient("## Decision\nPIVOT\n## Justification\nFlawed.")
        result = rc_executor._execute_research_decision(
            stage_dir, run_dir, rc_config, adapters, llm=fake_llm
        )
        assert result.decision == "pivot"

    def test_no_llm_defaults_to_proceed(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        """Without an LLM the stage conservatively decides to proceed."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-15"
        stage_dir.mkdir(parents=True)
        result = rc_executor._execute_research_decision(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        assert result.decision == "proceed"
class TestMultiPerspectiveGenerate:
    """Tests for _multi_perspective_generate() fan-out over debate roles."""

    def test_generates_all_perspectives(self, tmp_path: Path) -> None:
        """One LLM call and one markdown file are produced per role."""
        roles = {
            "role_a": {"system": "You are A.", "user": "Do A for {topic}."},
            "role_b": {"system": "You are B.", "user": "Do B for {topic}."},
        }
        fake_llm = FakeLLMClient("perspective output")
        perspectives_dir = tmp_path / "perspectives"
        result = rc_executor._multi_perspective_generate(
            fake_llm, roles, {"topic": "test"}, perspectives_dir
        )
        assert set(result.keys()) == {"role_a", "role_b"}
        assert (perspectives_dir / "role_a.md").exists()
        assert (perspectives_dir / "role_b.md").exists()
        assert len(fake_llm.calls) == 2

    def test_saves_perspective_content(self, tmp_path: Path) -> None:
        """The raw LLM output is persisted verbatim per role."""
        roles = {"critic": {"system": "Be critical.", "user": "Criticize {topic}."}}
        fake_llm = FakeLLMClient("critical analysis here")
        perspectives_dir = tmp_path / "perspectives"
        rc_executor._multi_perspective_generate(
            fake_llm, roles, {"topic": "ml"}, perspectives_dir
        )
        content = (perspectives_dir / "critic.md").read_text()
        assert content == "critical analysis here"

    def test_renders_variables_in_prompts(self, tmp_path: Path) -> None:
        """{topic}-style placeholders are substituted before the LLM is called."""
        roles = {"r1": {"system": "Sys for {topic}.", "user": "User for {topic}."}}
        fake_llm = FakeLLMClient("ok")
        rc_executor._multi_perspective_generate(
            fake_llm, roles, {"topic": "RL"}, tmp_path / "p"
        )
        call = fake_llm.calls[0]
        assert "RL" in call[0]["content"]
class TestSynthesizePerspectives:
    """Tests for _synthesize_perspectives() merging debate outputs."""

    def test_combines_perspectives(self) -> None:
        fake_llm = FakeLLMClient("synthesized result")
        prompt_manager = rc_executor.PromptManager()
        outputs = {"innovator": "idea A", "contrarian": "idea B"}
        merged = rc_executor._synthesize_perspectives(
            fake_llm, outputs, "hypothesis_synthesize", prompt_manager
        )
        assert merged == "synthesized result"
        # Both role names must appear in the user prompt sent to the LLM.
        user_prompt = fake_llm.calls[0][0]["content"]
        assert "innovator" in user_prompt
        assert "contrarian" in user_prompt
class TestHypothesisGenDebate:
    """Tests for the multi-perspective debate in _execute_hypothesis_gen()."""

    def test_hypothesis_gen_with_llm_creates_perspectives(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        """With an LLM, stage 8 writes hypotheses.md plus three perspective files."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-08"
        stage_dir.mkdir(parents=True)
        _write_prior_artifact(run_dir, 7, "synthesis.md", "# Synthesis\nGap found.")
        fake_llm = FakeLLMClient("## H1\nTest hypothesis")
        result = rc_executor._execute_hypothesis_gen(
            stage_dir, run_dir, rc_config, adapters, llm=fake_llm
        )
        assert result.status == StageStatus.DONE
        assert "hypotheses.md" in result.artifacts
        perspectives_dir = stage_dir / "perspectives"
        assert perspectives_dir.exists()
        # Should have 3 perspective files (innovator, pragmatist, contrarian)
        perspective_files = list(perspectives_dir.glob("*.md"))
        assert len(perspective_files) == 3

    def test_hypothesis_gen_without_llm_no_perspectives(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        """Without an LLM the debate is skipped but hypotheses.md is still written."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-08"
        stage_dir.mkdir(parents=True)
        _write_prior_artifact(run_dir, 7, "synthesis.md", "# Synthesis\nGap found.")
        result = rc_executor._execute_hypothesis_gen(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        assert result.status == StageStatus.DONE
        assert "hypotheses.md" in result.artifacts
        # No perspectives directory when no LLM
        assert not (stage_dir / "perspectives").exists()
class TestResultAnalysisDebate:
    """Stage 14 (result analysis) multi-perspective debate behaviour."""

    def test_result_analysis_with_llm_creates_perspectives(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        run_dir = tmp_path / "run"
        stage_dir = run_dir / "stage-14"
        stage_dir.mkdir(parents=True)
        _write_prior_artifact(run_dir, 1, "goal.md", "# Goal\nTest")
        _write_prior_artifact(run_dir, 8, "hypotheses.md", "# H1\nTest")
        result = rc_executor._execute_result_analysis(
            stage_dir, run_dir, rc_config, adapters,
            llm=FakeLLMClient("## Analysis\nResults look good."),
        )
        assert result.status == StageStatus.DONE
        assert "analysis.md" in result.artifacts
        # Debate mode writes one file per role (optimist, skeptic, methodologist).
        perspectives_dir = stage_dir / "perspectives"
        assert perspectives_dir.exists()
        assert len(list(perspectives_dir.glob("*.md"))) == 3

    def test_result_analysis_without_llm_no_perspectives(
        self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        run_dir = tmp_path / "run"
        stage_dir = run_dir / "stage-14"
        stage_dir.mkdir(parents=True)
        result = rc_executor._execute_result_analysis(
            stage_dir, run_dir, rc_config, adapters, llm=None
        )
        assert result.status == StageStatus.DONE
        assert "analysis.md" in result.artifacts
        assert not (stage_dir / "perspectives").exists()
class TestParseMetricsFromStdout:
    """Tests for _parse_metrics_from_stdout() helper.

    Uses the module-level ``rc_executor`` alias (as the other test classes in
    this file do, e.g. ``rc_executor._detect_runtime_issues``) instead of
    re-importing the helper function inside every single test.
    """

    def test_parses_simple_name_value(self) -> None:
        # Plain "name: value" lines become float-valued metric entries.
        metrics = rc_executor._parse_metrics_from_stdout("loss: 0.0042\naccuracy: 0.95")
        assert metrics["loss"] == pytest.approx(0.0042)
        assert metrics["accuracy"] == pytest.approx(0.95)

    def test_parses_compound_names(self) -> None:
        # Metric names may contain spaces and parentheses.
        stdout = (
            "UCB (Stochastic) cumulative_regret: 361.9233\n"
            "EXP3 (Adversarial) total_rewards: 13368.4811"
        )
        metrics = rc_executor._parse_metrics_from_stdout(stdout)
        assert "UCB (Stochastic) cumulative_regret" in metrics
        assert metrics["UCB (Stochastic) cumulative_regret"] == pytest.approx(361.9233)

    def test_ignores_non_numeric_lines(self) -> None:
        metrics = rc_executor._parse_metrics_from_stdout(
            "Running experiment...\nloss: 0.5\nDone."
        )
        assert len(metrics) == 1
        assert metrics["loss"] == pytest.approx(0.5)

    def test_empty_stdout_returns_empty_dict(self) -> None:
        assert rc_executor._parse_metrics_from_stdout("") == {}

    def test_handles_negative_values(self) -> None:
        metrics = rc_executor._parse_metrics_from_stdout(
            "UCB (Adversarial) cumulative_regret: -3877.5323"
        )
        assert metrics["UCB (Adversarial) cumulative_regret"] == pytest.approx(-3877.5323)

    def test_filters_log_lines(self) -> None:
        # Progress/log lines ending in ": <number>" must not become metrics.
        stdout = (
            "Running experiments for support set size: 1\n"
            "Loading model weights: 42\n"
            "Training epoch: 5\n"
            "loss: 0.123\n"
            "accuracy: 0.95\n"
        )
        metrics = rc_executor._parse_metrics_from_stdout(stdout)
        assert "loss" in metrics
        assert "accuracy" in metrics
        assert len(metrics) == 2  # log lines should be excluded

    def test_filters_long_name_lines(self) -> None:
        # Overly long "names" are treated as status messages, not metrics.
        stdout = "this is a very long status message that should not be a metric: 42\n"
        assert rc_executor._parse_metrics_from_stdout(stdout) == {}
class TestDetectRuntimeIssues:
    """Tests for _detect_runtime_issues() helper."""

    def _make_sandbox_result(
        self,
        metrics: dict | None = None,
        stdout: str = "",
        stderr: str = "",
    ):
        # Minimal stand-in for a sandbox run-result object.
        from types import SimpleNamespace
        return SimpleNamespace(
            metrics=metrics or {},
            stdout=stdout,
            stderr=stderr,
            returncode=0,
            elapsed_sec=1.0,
            timed_out=False,
        )

    def test_no_issues_returns_empty_string(self) -> None:
        clean = self._make_sandbox_result(metrics={"loss": 0.5}, stdout="loss: 0.5")
        assert rc_executor._detect_runtime_issues(clean) == ""

    def test_detects_nan_in_metrics(self) -> None:
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(metrics={"loss": float("nan")})
        )
        assert "NaN" in issues
        assert "loss" in issues

    def test_detects_inf_in_metrics(self) -> None:
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(metrics={"loss": float("inf")})
        )
        assert "Inf" in issues

    def test_detects_nan_in_stdout(self) -> None:
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(stdout="accuracy: nan\nloss: 0.5")
        )
        assert "NaN" in issues or "nan" in issues

    def test_detects_runtime_warning_in_stderr(self) -> None:
        stderr = (
            "optimizers.py:76: RuntimeWarning: invalid value encountered in divide\n"
            " directions = np.vstack((directions[1:], new_direction / norm))\n"
        )
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(stderr=stderr)
        )
        assert "RuntimeWarning" in issues
        assert "invalid value" in issues

    def test_detects_division_error_in_stderr(self) -> None:
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(stderr="ZeroDivisionError: division by zero\n")
        )
        assert "Error" in issues

    def test_ignores_benign_stderr(self) -> None:
        # Non-warning stderr should be ignored
        benign = self._make_sandbox_result(stderr="Loading module...\nDone.\n")
        assert rc_executor._detect_runtime_issues(benign) == ""

    def test_combined_nan_and_stderr(self) -> None:
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(
                metrics={"accuracy": float("nan")},
                stderr="RuntimeWarning: invalid value\n",
            )
        )
        assert "NaN" in issues
        assert "RuntimeWarning" in issues

    def test_detects_dummy_metric_identical_values(self) -> None:
        # A metric whose value is identical across all conditions looks fabricated.
        stdout = (
            "UCB (Stochastic) convergence_rate: 1.0000\n"
            "UCB (Adversarial) convergence_rate: 1.0000\n"
            "Thompson (Stochastic) convergence_rate: 1.0000\n"
            "Thompson (Adversarial) convergence_rate: 1.0000\n"
        )
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(stdout=stdout)
        )
        assert "DUMMY" in issues
        assert "convergence_rate" in issues

    def test_no_dummy_metric_when_values_differ(self) -> None:
        stdout = (
            "UCB (Stochastic) regret: 78.5\n"
            "Thompson (Stochastic) regret: 121.0\n"
            "EpsilonGreedy (Stochastic) regret: 42.1\n"
        )
        issues = rc_executor._detect_runtime_issues(
            self._make_sandbox_result(stdout=stdout)
        )
        assert "DUMMY" not in issues
class TestRemoveBibtexEntries:
    """Tests for _remove_bibtex_entries() helper."""

    def test_removes_specified_keys(self) -> None:
        bib = (
            '@article{smith2024,\n title={Good Paper},\n author={Smith},\n}\n\n'
            '@article{venus2024,\n title={Venus Exploration},\n author={NASA},\n}\n'
        )
        cleaned = rc_executor._remove_bibtex_entries(bib, {"venus2024"})
        assert "smith2024" in cleaned
        assert "venus2024" not in cleaned

    def test_keeps_all_when_no_match(self) -> None:
        bib = '@article{smith2024,\n title={Paper},\n}\n'
        assert "smith2024" in rc_executor._remove_bibtex_entries(bib, {"other_key"})

    def test_empty_bib(self) -> None:
        # An empty bibliography passes through unchanged.
        assert rc_executor._remove_bibtex_entries("", {"key"}) == ""
class TestRemoveCitationsFromText:
    """Tests for _remove_citations_from_text() helper."""

    def test_removes_latex_cite(self) -> None:
        text = r"As shown in \cite{venus2024}, the results are..."
        cleaned = rc_executor._remove_citations_from_text(text, {"venus2024"})
        assert "venus2024" not in cleaned
        assert "results are" in cleaned

    def test_removes_markdown_cite(self) -> None:
        cleaned = rc_executor._remove_citations_from_text(
            "Prior work [venus2024] explored this topic.", {"venus2024"}
        )
        assert "venus2024" not in cleaned

    def test_cleans_multi_cite_comma(self) -> None:
        # Dropping one key from a multi-key \cite keeps the remaining key intact.
        cleaned = rc_executor._remove_citations_from_text(
            r"\cite{good2024,venus2024}", {"venus2024"}
        )
        assert r"\cite{good2024}" in cleaned
class TestCollectRawExperimentMetrics:
    """Tests for _collect_raw_experiment_metrics() helper."""

    @staticmethod
    def _write_run(tmp_path: Path, payload: dict) -> Path:
        # Lay out <run>/stage-12/runs/run-1.json and return the run directory.
        run_dir = tmp_path / "run"
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True)
        (runs_dir / "run-1.json").write_text(json.dumps(payload))
        return run_dir

    def test_returns_empty_when_no_runs(self, tmp_path: Path) -> None:
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        block, has_parsed = rc_executor._collect_raw_experiment_metrics(run_dir)
        assert block == ""
        assert not has_parsed

    def test_extracts_metrics_from_stdout(self, tmp_path: Path) -> None:
        run_dir = self._write_run(tmp_path, {
            "metrics": {},
            "stdout": "UCB regret: 361.92\nThompson regret: 576.24\n",
        })
        block, has_parsed = rc_executor._collect_raw_experiment_metrics(run_dir)
        assert "361.92" in block
        assert "576.24" in block
        assert "1 run(s)" in block
        assert not has_parsed

    def test_extracts_from_metrics_dict(self, tmp_path: Path) -> None:
        run_dir = self._write_run(
            tmp_path, {"metrics": {"loss": 0.042, "accuracy": 0.95}, "stdout": ""}
        )
        block, has_parsed = rc_executor._collect_raw_experiment_metrics(run_dir)
        assert "loss" in block
        assert "0.042" in block
        assert has_parsed

    def test_deduplicates_metrics(self, tmp_path: Path) -> None:
        run_dir = self._write_run(
            tmp_path, {"metrics": {"loss": 0.5}, "stdout": "loss: 0.5\nloss: 0.5\n"}
        )
        block, _ = rc_executor._collect_raw_experiment_metrics(run_dir)
        # "loss: 0.5" should appear only once (deduplicated)
        assert block.count("loss: 0.5") == 1
class TestCollectExperimentEvidence:
    """Tests for _collect_experiment_evidence() helper."""

    @staticmethod
    def _write_json(path: Path, payload: dict) -> None:
        # Create parent directories as needed and dump ``payload`` as JSON.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(payload), encoding="utf-8")

    def test_returns_empty_when_no_artifacts(self, tmp_path: Path) -> None:
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        assert rc_executor._collect_experiment_evidence(run_dir) == ""

    def test_includes_main_py_code(self, run_dir: Path) -> None:
        exp_dir = run_dir / "stage-10" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('hello')", encoding="utf-8")
        evidence = rc_executor._collect_experiment_evidence(run_dir)
        assert "main.py" in evidence
        assert "hello" in evidence

    def test_includes_run_metrics(self, run_dir: Path) -> None:
        self._write_json(
            run_dir / "stage-12" / "runs" / "run-1.json",
            {"metrics": {"loss": 0.5}, "elapsed_sec": 3.2},
        )
        evidence = rc_executor._collect_experiment_evidence(run_dir)
        assert "loss" in evidence
        assert "0.5" in evidence

    def test_includes_stderr_excerpt(self, run_dir: Path) -> None:
        self._write_json(
            run_dir / "stage-12" / "runs" / "run-1.json",
            {"metrics": {"loss": 0.5}, "stderr": "RuntimeWarning: divide by zero"},
        )
        assert "divide by zero" in rc_executor._collect_experiment_evidence(run_dir)

    def test_includes_refinement_summary(self, run_dir: Path) -> None:
        self._write_json(
            run_dir / "stage-13" / "refinement_log.json",
            {
                "iterations": [{"iteration": 1}, {"iteration": 2}],
                "converged": True,
                "stop_reason": "no_improvement_for_2_iterations",
                "best_metric": 0.3,
            },
        )
        evidence = rc_executor._collect_experiment_evidence(run_dir)
        assert "iterations_executed" in evidence
        assert "2" in evidence

    def test_includes_actual_trial_count(self, run_dir: Path) -> None:
        self._write_json(
            run_dir / "stage-12" / "runs" / "run-1.json",
            {"metrics": {"loss": 0.5}},
        )
        evidence = rc_executor._collect_experiment_evidence(run_dir)
        assert "1 time(s)" in evidence
        assert "CRITICAL" in evidence
class TestWritePaperSections:
    """Tests for _write_paper_sections() multi-call writing.

    Cleanups vs. the previous version: removed the unused ``call_count``
    dict and the vacuous ``assert llm.calls is not None`` (the attribute is
    always a list after ``__init__``).
    """

    def test_produces_three_part_draft(self) -> None:
        # Canned responses, one per section-writing call.
        parts = [
            "# Test Title\n\n## Abstract\nTest abstract.\n\n## Introduction\nTest intro.\n\n## Related Work\nTest related.",
            "## Method\nTest method.\n\n## Experiments\nTest experiments.",
            "## Results\nTest results.\n\n## Discussion\nTest discussion.\n\n## Limitations\nTest limits.\n\n## Conclusion\nTest conclusion.",
        ]

        class MultiCallLLM:
            """Fake LLM returning one canned part per chat() call."""

            def __init__(self):
                self.calls: list = []

            def chat(self, messages, **kwargs):
                self.calls.append(messages)
                from researchclaw.llm.client import LLMResponse
                idx = len(self.calls) - 1
                return LLMResponse(content=parts[min(idx, 2)], model="fake")

        llm = MultiCallLLM()
        from researchclaw.prompts import PromptManager
        draft = rc_executor._write_paper_sections(
            llm=llm,
            pm=PromptManager(),
            preamble="Test preamble",
            topic_constraint="",
            exp_metrics_instruction="",
            citation_instruction="",
            outline="Test outline",
        )
        # One LLM call per part, and every major section present in the draft.
        assert len(llm.calls) == 3
        for heading in ("## Abstract", "## Method", "## Results", "## Conclusion"):
            assert heading in draft

    def test_each_call_receives_prior_context(self) -> None:
        class ContextTrackingLLM:
            """Fake LLM that records every user-role prompt it receives."""

            def __init__(self):
                self.user_prompts: list[str] = []

            def chat(self, messages, **kwargs):
                for m in messages:
                    if m.get("role") == "user":
                        self.user_prompts.append(m["content"])
                from researchclaw.llm.client import LLMResponse
                return LLMResponse(content="## Section\nContent here.", model="fake")

        llm = ContextTrackingLLM()
        from researchclaw.prompts import PromptManager
        rc_executor._write_paper_sections(
            llm=llm,
            pm=PromptManager(),
            preamble="Preamble",
            topic_constraint="",
            exp_metrics_instruction="",
            citation_instruction="",
            outline="Outline",
        )
        assert len(llm.user_prompts) == 3
        # Calls 2 and 3 must carry forward the previously written sections.
        assert "sections written so far" in llm.user_prompts[1]
        assert "completing a paper" in llm.user_prompts[2]
class TestLoadHardwareProfile:
    """Tests for _load_hardware_profile()."""

    @pytest.fixture()
    def run_dir(self, tmp_path: Path) -> Path:
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        return run_dir

    def test_loads_valid_profile(self, run_dir: Path) -> None:
        stage = run_dir / "stage-01"
        stage.mkdir()
        (stage / "hardware_profile.json").write_text(
            json.dumps({"has_gpu": True, "gpu_type": "mps", "tier": "limited"}),
            encoding="utf-8",
        )
        profile = rc_executor._load_hardware_profile(run_dir)
        assert profile is not None
        assert profile["gpu_type"] == "mps"

    def test_returns_none_when_missing(self, run_dir: Path) -> None:
        assert rc_executor._load_hardware_profile(run_dir) is None

    def test_returns_none_on_invalid_json(self, run_dir: Path) -> None:
        # Malformed JSON is treated the same as a missing profile.
        stage = run_dir / "stage-01"
        stage.mkdir()
        (stage / "hardware_profile.json").write_text("not json", encoding="utf-8")
        assert rc_executor._load_hardware_profile(run_dir) is None
class TestExpandSearchQueries:
    """Tests for _expand_search_queries()."""

    def test_adds_broader_queries(self) -> None:
        queries = ["gradient descent optimization algorithms"]
        topic = "Comparing gradient descent optimization algorithms on benchmark functions"
        expanded = rc_executor._expand_search_queries(queries, topic)
        assert len(expanded) > len(queries)

    def test_deduplicates(self) -> None:
        expanded = rc_executor._expand_search_queries(
            ["gradient descent survey"], "gradient descent optimization"
        )
        # No two entries may be equal after case/whitespace normalization.
        normalized = [q.lower().strip() for q in expanded]
        assert len(normalized) == len(set(normalized))

    def test_preserves_original_queries(self) -> None:
        expanded = rc_executor._expand_search_queries(
            ["query A", "query B"],
            "some research topic about machine learning methods",
        )
        # Originals must come first, in their original order.
        assert expanded[0] == "query A"
        assert expanded[1] == "query B"

    def test_adds_survey_benchmark_variants(self) -> None:
        expanded = rc_executor._expand_search_queries(
            ["deep learning"],
            "deep learning for image classification with limited data",
        )
        assert any("survey" in q.lower() for q in expanded)
        assert any("benchmark" in q.lower() for q in expanded)
# ── R4-1: Experiment Budget Guard Tests ──────────────────────────────
class TestComputeBudgetBlock:
    """Test compute_budget prompt block injection (R4-1a)."""

    def test_compute_budget_block_exists_in_prompt_manager(self) -> None:
        # The default prompt library must ship a "compute_budget" block.
        from researchclaw.prompts import PromptManager
        pm = PromptManager()
        block = pm.block("compute_budget")
        assert "time_budget_sec" in block or "Compute Budget" in block

    def test_compute_budget_injected_into_code_generation(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        import sys
        # Full config built by hand so we control experiment.time_budget_sec
        # (60s) — that value is what we expect to surface in the LLM prompt.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {
                "topic": "optimizer comparison",
                "domains": ["ml"],
                "daily_paper_count": 2,
                "quality_threshold": 8.2,
            },
            "runtime": {"timezone": "UTC"},
            "notifications": {
                "channel": "local",
                "on_stage_start": True,
                "on_stage_fail": False,
                "on_gate_required": True,
            },
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline-test-key",
                "primary_model": "fake-model",
                "fallback_models": [],
            },
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 60,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
                "sandbox": {
                    "python_path": sys.executable,
                    "gpu_required": False,
                    "max_memory_mb": 1024,
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # Write exp_plan prior artifact
        _write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test")
        # Capture what the LLM receives
        llm = FakeLLMClient(
            "```filename:main.py\nimport numpy as np\nprint('best_loss: 0.1')\n```"
        )
        stage_dir = run_dir / "stage-11"
        stage_dir.mkdir(parents=True, exist_ok=True)
        rc_executor._execute_code_generation(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # The LLM should have received compute budget info in some call
        # (may be first call in legacy mode, or second call with CodeAgent)
        assert len(llm.calls) >= 1
        all_user_msgs = " ".join(
            call[-1]["content"] for call in llm.calls if call
        )
        assert "60" in all_user_msgs or "Compute Budget" in all_user_msgs
class TestPartialTimeoutStatus:
    """Test partial status for timed-out experiments with data (R4-1c)."""

    def test_timed_out_with_metrics_sets_partial_status(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        import sys
        # Tiny 2-second budget so the 10-second sleep in the generated
        # experiment script below is guaranteed to hit the timeout.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {
                "topic": "test topic",
                "domains": ["ml"],
                "daily_paper_count": 2,
                "quality_threshold": 8.2,
            },
            "runtime": {"timezone": "UTC"},
            "notifications": {
                "channel": "local",
                "on_stage_start": True,
                "on_stage_fail": False,
                "on_gate_required": True,
            },
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline-test-key",
                "primary_model": "fake-model",
                "fallback_models": [],
            },
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 2,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
                "sandbox": {
                    "python_path": sys.executable,
                    "gpu_required": False,
                    "max_memory_mb": 1024,
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # Write experiment code that prints some metrics then sleeps
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text(
            "import time, sys\n"
            "print('best_loss: 0.5', flush=True)\n"
            "sys.stdout.flush()\n"
            "time.sleep(10)\n",
            encoding="utf-8",
        )
        stage_dir = run_dir / "stage-12"
        stage_dir.mkdir(parents=True, exist_ok=True)
        rc_executor._execute_experiment_run(
            stage_dir, run_dir, cfg, adapters
        )
        run_file = stage_dir / "runs" / "run-1.json"
        assert run_file.exists()
        payload = json.loads(run_file.read_text(encoding="utf-8"))
        # Should be "partial" since metrics were captured before timeout
        assert payload["timed_out"] is True
        # Status should be "partial" if metrics captured, "failed" if not
        if payload["metrics"]:
            assert payload["status"] == "partial"
        else:
            # Subprocess stdout may not flush before kill on some platforms
            assert payload["status"] == "failed"
class TestTimeoutAwareRefine:
    """Test timeout-aware prompt injection in iterative refine (R4-1b)."""

    def _prepare_timed_out_run(self, run_dir: Path) -> None:
        """Create a prior run that timed out with no metrics."""
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "run_id": "run-1",
                "task_id": "sandbox-main",
                "status": "failed",
                "metrics": {},
                "timed_out": True,
                "elapsed_sec": 120.0,
            }),
            encoding="utf-8",
        )
        # Write experiment code
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text(
            "print('best_loss: 0.1')\n",
            encoding="utf-8",
        )

    def test_timeout_refine_injects_scale_reduction_prompt(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        self._prepare_timed_out_run(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Budget (120s) matches the elapsed_sec of the timed-out prior run, so
        # the refine prompt should mention both the timeout and that number.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {
                "topic": "test topic",
                "domains": ["ml"],
                "daily_paper_count": 2,
                "quality_threshold": 8.2,
            },
            "runtime": {"timezone": "UTC"},
            "notifications": {
                "channel": "local",
                "on_stage_start": True,
                "on_stage_fail": False,
                "on_gate_required": True,
            },
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline-test-key",
                "primary_model": "fake-model",
                "fallback_models": [],
            },
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 120,
                "max_iterations": 1,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient(
            "```python\nimport numpy as np\nprint('best_loss: 0.1')\n```"
        )
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # The LLM should have received the timeout-aware prompt
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        assert "TIMED OUT" in user_msg
        assert "120" in user_msg
# ── R4-2: Data Integrity Enforcement Tests ───────────────────────────
class TestDataIntegrityBlock:
    """Test paper draft blocked when no metrics exist (R4-2a)."""

    def test_paper_draft_blocked_with_no_metrics(
        self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        _write_prior_artifact(run_dir, 16, "outline.md", "# Outline\n## Abstract\n")
        # Only a failed, metric-less run exists; no experiment_summary.json.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        payload = {"run_id": "run-1", "status": "failed", "metrics": {}, "timed_out": True}
        (runs_dir / "run-1.json").write_text(json.dumps(payload), encoding="utf-8")
        stage_dir = run_dir / "stage-17"
        stage_dir.mkdir(parents=True, exist_ok=True)
        llm = FakeLLMClient("should not be called")
        result = rc_executor._execute_paper_draft(
            stage_dir, run_dir, rc_config, adapters, llm=llm
        )
        assert result.status == StageStatus.FAILED
        draft = (stage_dir / "paper_draft.md").read_text(encoding="utf-8")
        assert "Blocked" in draft or "BLOCKED" in draft or "no metrics" in draft.lower()
        # The LLM must never be invoked when the draft is blocked.
        assert len(llm.calls) == 0

    def test_paper_draft_proceeds_with_metrics(
        self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        _write_prior_artifact(run_dir, 16, "outline.md", "# Outline\n## Abstract\n")
        # Real metrics exist, so drafting should go ahead.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        payload = {
            "run_id": "run-1",
            "status": "completed",
            "metrics": {"best_loss": 0.123},
            "stdout": "best_loss: 0.123\n",
        }
        (runs_dir / "run-1.json").write_text(json.dumps(payload), encoding="utf-8")
        stage_dir = run_dir / "stage-17"
        stage_dir.mkdir(parents=True, exist_ok=True)
        llm = FakeLLMClient("# Paper Title\n## Abstract\nSome abstract text.")
        rc_executor._execute_paper_draft(
            stage_dir, run_dir, rc_config, adapters, llm=llm
        )
        assert len(llm.calls) >= 1
        # Anti-fabrication instructions must appear in the prompts the LLM saw.
        all_prompts = " ".join(
            msg["content"] for call in llm.calls for msg in call
        )
        assert "Data Integrity" in all_prompts or "ONLY report numbers" in all_prompts
# ── R4-3: Conference-Grade Title Guidelines Tests ────────────────────
class TestTitleGuidelines:
    """Test title_guidelines and abstract_structure blocks (R4-3)."""

    def test_title_guidelines_block_exists(self) -> None:
        from researchclaw.prompts import PromptManager
        block = PromptManager().block("title_guidelines")
        assert "novelty" in block.lower() or "TITLE RULES" in block
        assert "14 words" in block or "15 words" in block or "concrete" in block.lower()

    def test_abstract_structure_block_exists(self) -> None:
        from researchclaw.prompts import PromptManager
        block = PromptManager().block("abstract_structure")
        assert "5-sentence" in block or "problem" in block.lower()

    def test_title_guidelines_injected_into_paper_draft(
        self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle
    ) -> None:
        _write_prior_artifact(run_dir, 16, "outline.md", "# Outline\n")
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        run_payload = {
            "run_id": "run-1",
            "status": "completed",
            "metrics": {"best_loss": 0.1},
            "stdout": "best_loss: 0.1\n",
        }
        (runs_dir / "run-1.json").write_text(json.dumps(run_payload), encoding="utf-8")
        stage_dir = run_dir / "stage-17"
        stage_dir.mkdir(parents=True, exist_ok=True)
        llm = FakeLLMClient("# Paper Title\n## Abstract\nText.")
        rc_executor._execute_paper_draft(
            stage_dir, run_dir, rc_config, adapters, llm=llm
        )
        # Title guidance must appear somewhere in the prompts the LLM saw.
        all_prompts = " ".join(
            msg["content"] for call in llm.calls for msg in call
        )
        assert "Title" in all_prompts or "TITLE" in all_prompts
# ── R4-4: Conference-Grade Writing Quality Tests ─────────────────────
class TestConferenceWritingQuality:
    """Test enhanced writing prompts and writing_guide.py (R4-4)."""

    def test_writing_guide_format_all(self) -> None:
        # With no section filter, every tip category is included.
        from researchclaw.writing_guide import format_writing_tips
        result = format_writing_tips()
        assert "Conference Writing Best Practices" in result
        assert "Title" in result
        assert "Common Rejections" in result

    def test_writing_guide_format_subset(self) -> None:
        # Requesting specific sections must exclude the others.
        from researchclaw.writing_guide import format_writing_tips
        result = format_writing_tips(["title", "abstract"])
        assert "Title" in result
        assert "Abstract" in result
        assert "Common Rejections" not in result

    def test_paper_draft_system_includes_principles(self) -> None:
        from researchclaw.prompts import PromptManager
        pm = PromptManager()
        sp = pm.for_stage(
            "paper_draft",
            preamble="test",
            topic_constraint="test",
            exp_metrics_instruction="test",
            citation_instruction="test",
            outline="test",
        )
        # System prompt should mention key principles. The case-insensitive
        # check subsumes the former redundant literal "NOVELTY" clause.
        assert "novelty" in sp.system.lower()
        assert "fabricate" in sp.system.lower() or "real experimental" in sp.system.lower()
# ── R5-1 & R5-2: Bug Fixes Tests ────────────────────────────────────
class TestRefineTimeoutAndIterationCap:
    """Test R5-1 (no 120s cap) and R5-2 (iteration cap raised to 10).

    Cleanup vs. the previous version: removed the unused ``ast.parse`` call
    (its ``tree`` result was never read) and the duplicate
    ``inspect.getsource`` invocation.
    """

    def test_refine_timeout_uses_full_budget(self) -> None:
        """R5-1: Refine sandbox should NOT cap at 120s."""
        import inspect
        source_text = inspect.getsource(rc_executor._execute_iterative_refine)
        # Should NOT contain min(..., 120)
        assert "min(config.experiment.time_budget_sec, 120)" not in source_text

    def test_iteration_cap_is_10(self) -> None:
        """R5-2: Max iterations should be capped at 10, not 3."""
        import inspect
        source = inspect.getsource(rc_executor._execute_iterative_refine)
        assert "min(requested_iterations, 10)" in source
        assert "min(requested_iterations, 3)" not in source

    def test_refine_respects_high_iteration_count(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """R5-2: Setting max_iterations=7 should actually allow 7 iterations."""
        # Write prior run artifacts
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({"run_id": "run-1", "status": "completed",
                        "metrics": {"best_loss": 0.5}}),
            encoding="utf-8",
        )
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('best_loss: 0.5')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 300,
                "max_iterations": 7,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # LLM always returns same code — will trigger no_improvement early stop
        llm = FakeLLMClient("```python\nprint('best_loss: 0.5')\n```")
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        log = json.loads((stage_dir / "refinement_log.json").read_text(encoding="utf-8"))
        # Should have been allowed more than 3 iterations (capped at 7)
        assert log["max_iterations_executed"] == 7
        # But may have stopped early due to no_improvement_for_2_iterations
        assert len(log["iterations"]) >= 2
# ── R5-3: NaN/Divergence Fast-Fail Tests ────────────────────────────
class TestNaNDivergenceDetection:
    """Test NaN/Inf filtering and divergence detection (R5-3).

    Cleanup vs. the previous version: dropped redundant case-sensitive
    boolean operands (e.g. ``"NaN" in x or "nan" in x.lower()``) that were
    fully subsumed by the case-insensitive ``.lower()`` check.
    """

    def test_parse_metrics_filters_nan(self) -> None:
        from researchclaw.experiment.sandbox import parse_metrics
        stdout = "best_loss: 0.5\nbad_metric: nan\ngood_metric: 1.23\n"
        metrics = parse_metrics(stdout)
        assert "best_loss" in metrics
        assert "good_metric" in metrics
        assert "bad_metric" not in metrics  # NaN should be filtered

    def test_parse_metrics_filters_inf(self) -> None:
        from researchclaw.experiment.sandbox import parse_metrics
        stdout = "metric_a: inf\nmetric_b: -inf\nmetric_c: 0.42\n"
        metrics = parse_metrics(stdout)
        assert "metric_c" in metrics
        assert "metric_a" not in metrics
        assert "metric_b" not in metrics

    def test_detect_nan_divergence_finds_nan(self) -> None:
        from researchclaw.experiment.sandbox import detect_nan_divergence
        result = detect_nan_divergence("loss: nan\nstep 5 done", "")
        assert result is not None
        assert "nan" in result.lower()

    def test_detect_nan_divergence_finds_diverging_loss(self) -> None:
        from researchclaw.experiment.sandbox import detect_nan_divergence
        result = detect_nan_divergence("best_loss: 999.5\n", "")
        assert result is not None
        assert "loss" in result.lower() or "999" in result

    def test_detect_nan_divergence_returns_none_for_clean(self) -> None:
        from researchclaw.experiment.sandbox import detect_nan_divergence
        result = detect_nan_divergence("best_loss: 0.123\naccuracy: 0.95\n", "")
        assert result is None

    def test_runtime_issues_detects_diverging_loss(self) -> None:
        from types import SimpleNamespace
        fake_result = SimpleNamespace(
            metrics={"best_loss": 500.0},
            stdout="best_loss: 500.0\n",
            stderr="",
        )
        issues = rc_executor._detect_runtime_issues(fake_result)
        assert "diverging" in issues.lower()

    def test_compute_budget_includes_nan_guard(self) -> None:
        from researchclaw.prompts import PromptManager
        pm = PromptManager()
        block = pm.block("compute_budget")
        assert "nan" in block.lower() or "divergence" in block.lower()
# ── R5-4: Experiment Harness Template Tests ──────────────────────────
class TestExperimentHarness:
    """Test the immutable experiment harness (R5-4)."""

    def test_harness_should_stop(self) -> None:
        """should_stop() is False right after creation, True past ~80% of budget."""
        from researchclaw.experiment.harness_template import ExperimentHarness
        h = ExperimentHarness(time_budget=1)
        assert not h.should_stop()  # Just created, not at 80% yet
        import time
        time.sleep(0.9)
        assert h.should_stop()  # Should be past 80% of 1s

    def test_harness_report_metric(self, capsys: pytest.CaptureFixture[str]) -> None:
        """report_metric() prints 'name: value' to stdout and records it internally."""
        from researchclaw.experiment.harness_template import ExperimentHarness
        h = ExperimentHarness(time_budget=60)
        h.report_metric("best_loss", 0.123)
        captured = capsys.readouterr()
        assert "best_loss: 0.123" in captured.out
        assert h._metrics["best_loss"] == 0.123

    def test_harness_rejects_nan(self, capsys: pytest.CaptureFixture[str]) -> None:
        """NaN values are not recorded and a warning is written to stderr."""
        from researchclaw.experiment.harness_template import ExperimentHarness
        h = ExperimentHarness(time_budget=60)
        h.report_metric("bad", float("nan"))
        captured = capsys.readouterr()
        assert "bad" not in h._metrics
        assert "non-finite" in captured.err.lower() or "WARNING" in captured.err

    def test_harness_rejects_inf(self, capsys: pytest.CaptureFixture[str]) -> None:
        """Inf values are not recorded either."""
        from researchclaw.experiment.harness_template import ExperimentHarness
        h = ExperimentHarness(time_budget=60)
        h.report_metric("bad", float("inf"))
        assert "bad" not in h._metrics

    def test_harness_finalize(self, tmp_path: Path) -> None:
        """finalize() writes metrics and logged results to ./results.json."""
        import os
        from researchclaw.experiment.harness_template import ExperimentHarness
        old_cwd = os.getcwd()
        # finalize() writes relative to the current working directory,
        # so chdir into tmp_path for the duration of the test.
        os.chdir(tmp_path)
        try:
            h = ExperimentHarness(time_budget=60)
            h.report_metric("accuracy", 0.95)
            h.report_metric("loss", 0.05)
            h.log_result({"condition": "A", "value": 1.0})
            h.finalize()
            results = json.loads((tmp_path / "results.json").read_text(encoding="utf-8"))
            assert results["metrics"]["accuracy"] == 0.95
            assert results["metrics"]["loss"] == 0.05
            assert len(results["results"]) == 1
        finally:
            os.chdir(old_cwd)

    def test_harness_progress(self) -> None:
        """progress reports elapsed fraction of the budget, clamped to [0, 1]."""
        from researchclaw.experiment.harness_template import ExperimentHarness
        h = ExperimentHarness(time_budget=1000)
        assert h.progress < 0.01  # Just started
        assert 0.0 <= h.progress <= 1.0

    def test_harness_injected_into_sandbox(self, tmp_path: Path) -> None:
        """run_project() injects the real experiment_harness.py into the copy."""
        import sys
        from researchclaw.config import SandboxConfig
        from researchclaw.experiment.sandbox import ExperimentSandbox
        config = SandboxConfig(python_path=sys.executable)
        sandbox = ExperimentSandbox(config, tmp_path / "sandbox")
        # Create a project dir
        project = tmp_path / "project"
        project.mkdir()
        (project / "main.py").write_text("print('test: 1.0')\n", encoding="utf-8")
        sandbox.run_project(project, timeout_sec=5)
        # Check that harness was injected (BUG-DA8-06: dir is now _project_{N})
        project_dirs = list((tmp_path / "sandbox").glob("_project_*"))
        assert project_dirs, "No _project_N directory found"
        harness_path = project_dirs[0] / "experiment_harness.py"
        assert harness_path.exists()
        content = harness_path.read_text(encoding="utf-8")
        assert "ExperimentHarness" in content

    def test_harness_not_overwritten_by_project(self, tmp_path: Path) -> None:
        """A project-supplied experiment_harness.py must be replaced by the real one."""
        import sys
        from researchclaw.config import SandboxConfig
        from researchclaw.experiment.sandbox import ExperimentSandbox
        config = SandboxConfig(python_path=sys.executable)
        sandbox = ExperimentSandbox(config, tmp_path / "sandbox")
        # Create a project with a fake experiment_harness.py
        project = tmp_path / "project"
        project.mkdir()
        (project / "main.py").write_text("print('test: 1.0')\n", encoding="utf-8")
        (project / "experiment_harness.py").write_text("# FAKE HARNESS", encoding="utf-8")
        sandbox.run_project(project, timeout_sec=5)
        # The real harness should be there, not the fake one (BUG-DA8-06)
        project_dirs = list((tmp_path / "sandbox").glob("_project_*"))
        assert project_dirs
        harness_path = project_dirs[0] / "experiment_harness.py"
        content = harness_path.read_text(encoding="utf-8")
        assert "ExperimentHarness" in content
        assert "FAKE HARNESS" not in content

    def test_prompt_mentions_harness(self) -> None:
        """The compute_budget prompt block must reference the harness."""
        from researchclaw.prompts import PromptManager
        pm = PromptManager()
        block = pm.block("compute_budget")
        assert "experiment_harness" in block or "ExperimentHarness" in block
# ── R5-5: Stdout Truncation Tests ────────────────────────────────────
class TestStdoutTruncation:
    """Test stdout/stderr truncation in refine run summaries (R5-5)."""

    def test_long_stdout_truncated_in_refine(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """A 200-line stdout from a prior run must reach the refine LLM truncated."""
        # Create a run with very long stdout
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        long_stdout = "\n".join(f"step {i}: loss={0.5 - i * 0.001:.6f}" for i in range(200))
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "run_id": "run-1",
                "status": "completed",
                "metrics": {"best_loss": 0.3},
                "stdout": long_stdout,
            }),
            encoding="utf-8",
        )
        # Prior experiment code that the refine loop starts from.
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('best_loss: 0.3')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Minimal full config; the experiment section caps refine at 1 iteration.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient("```python\nprint('best_loss: 0.3')\n```")
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # The LLM should have received truncated stdout, not all 200 lines
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        # Should contain truncation indicator
        assert "truncated" in user_msg or len(user_msg) < len(long_stdout)
# ===================================================================
# R6 Tests — Post-E2E Failure Analysis Fixes
# ===================================================================
class TestNoImproveStreakFix:
    """R6-1: no_improve_streak should only count iterations with real metrics."""

    def test_empty_metrics_dont_increment_streak(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """When metrics are empty (None), the streak should NOT increment."""
        # Seed a failed prior run whose payload carries no metrics at all.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "run_id": "run-1",
                "status": "failed",
                "metrics": {},
                "stdout": "FAIL: NaN/divergence detected",
            }),
            encoding="utf-8",
        )
        # Prior experiment code the refine loop starts from.
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('hello')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Minimal full config; max_iterations=4 leaves room for the early abort.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 4,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # LLM returns code that won't produce metrics in simulated mode
        llm = FakeLLMClient("```python\nprint('no metrics here')\n```")
        result = rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # Should abort after 3 consecutive no-metrics iterations
        log_path = stage_dir / "refinement_log.json"
        log_data = json.loads(log_path.read_text())
        # consecutive_no_metrics triggers early abort after 3 iterations
        assert len(log_data["iterations"]) == 3
        assert log_data.get("stop_reason") == "consecutive_no_metrics"
class TestStdoutFailureDetection:
    """R6-2: Detect stdout failure signals even when exit code is 0."""

    @staticmethod
    def _run_stage_and_read_payload(tmp_path: Path, script: str) -> dict:
        """Lay out a minimal run tree, execute stage-12 with *script* as
        main.py, and return the recorded run-1.json payload."""
        from researchclaw.pipeline.executor import _execute_experiment_run

        run_dir = tmp_path / "run"
        run_dir.mkdir()
        (run_dir / "stage-10").mkdir()
        exp_dir = run_dir / "stage-10" / "experiment"
        exp_dir.mkdir()
        (exp_dir / "main.py").write_text(script, encoding="utf-8")
        (run_dir / "stage-11").mkdir()
        (run_dir / "stage-11" / "schedule.json").write_text("{}", encoding="utf-8")
        stage_dir = run_dir / "stage-12"
        stage_dir.mkdir()
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
                "sandbox": {
                    "python_path": sys.executable,
                    "gpu_required": False,
                    "max_memory_mb": 512,
                    "allowed_imports": ["json"],
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        _execute_experiment_run(stage_dir, run_dir, cfg, AdapterBundle())
        run_file = stage_dir / "runs" / "run-1.json"
        assert run_file.exists()
        return json.loads(run_file.read_text())

    def test_fail_signal_in_stdout_marks_failed(self, tmp_path: Path) -> None:
        """Exit code 0 + 'FAIL:' in stdout + no metrics → status='failed'."""
        payload = self._run_stage_and_read_payload(
            tmp_path, "print('FAIL: NaN/divergence detected')\n"
        )
        assert payload["status"] == "failed"

    def test_clean_exit_no_fail_signal_marks_completed(self, tmp_path: Path) -> None:
        """Exit code 0 + valid metrics + no FAIL signal → status='completed'."""
        payload = self._run_stage_and_read_payload(
            tmp_path, "print('primary_metric: 0.95')\n"
        )
        assert payload["status"] == "completed"
class TestMetricValUndefined:
    """R6-3: metric_val should be initialized to None before conditional block."""

    def test_metric_val_initialized_before_use(self) -> None:
        """Verify the code pattern: metric_val = None before if block."""
        import inspect

        code = inspect.getsource(rc_executor._execute_iterative_refine)
        # Locate both markers and confirm the guard precedes the branch.
        guard_idx = code.find("metric_val = None")
        branch_idx = code.find("if validation.ok and config.experiment.mode")
        assert guard_idx != -1, "metric_val = None not found"
        assert branch_idx != -1, "sandbox block not found"
        assert guard_idx < branch_idx, "metric_val = None should come before sandbox block"
class TestConsecutiveEmptyMetrics:
    """R6-4: Pipeline should detect consecutive empty-metrics REFINE cycles."""

    @staticmethod
    def _write_cycle(run_dir: Path, stage_name: str, best_metrics: dict) -> None:
        """Create *stage_name* under *run_dir* with a summary fixture whose
        best run carries *best_metrics*."""
        stage = run_dir / stage_name
        stage.mkdir(parents=True)
        (stage / "experiment_summary.json").write_text(json.dumps({
            "metrics_summary": {},
            "best_run": {"metrics": best_metrics},
        }))

    def test_detects_consecutive_empty(self, tmp_path: Path) -> None:
        """Two cycles with empty metrics should return True."""
        from researchclaw.pipeline.runner import _consecutive_empty_metrics

        run_dir = tmp_path / "run"
        self._write_cycle(run_dir, "stage-14", {})     # current cycle
        self._write_cycle(run_dir, "stage-14_v1", {})  # previous cycle
        assert _consecutive_empty_metrics(run_dir, pivot_count=1) is True

    def test_not_empty_when_metrics_exist(self, tmp_path: Path) -> None:
        """If any cycle has real metrics, return False."""
        from researchclaw.pipeline.runner import _consecutive_empty_metrics

        run_dir = tmp_path / "run"
        self._write_cycle(run_dir, "stage-14", {"loss": 0.5})
        self._write_cycle(run_dir, "stage-14_v1", {})
        assert _consecutive_empty_metrics(run_dir, pivot_count=1) is False

    def test_false_when_no_previous_cycle(self, tmp_path: Path) -> None:
        """First cycle (no v1) should return False."""
        from researchclaw.pipeline.runner import _consecutive_empty_metrics

        run_dir = tmp_path / "run"
        # Deliberately no stage-14_v1 sibling.
        self._write_cycle(run_dir, "stage-14", {})
        assert _consecutive_empty_metrics(run_dir, pivot_count=1) is False
# ===================================================================
# R7 Tests — Experiment-Paper Quality Alignment
# ===================================================================
class TestMultiConditionEnforcement:
    """R7-1: Code generation prompt must enforce multi-condition experiments."""

    def test_code_generation_prompt_has_multi_condition_block(self) -> None:
        """The code_generation prompt should contain multi-condition instructions."""
        from researchclaw.prompts import PromptManager

        prompt = PromptManager().for_stage(
            "code_generation",
            topic="test topic",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions:\n - echo_chamber\n - bridge_building\n - random",
        )
        for marker in ("MULTI-CONDITION REQUIREMENT", "condition=", "SUMMARY"):
            assert marker in prompt.user

    def test_multi_condition_labels_required(self) -> None:
        """Prompt must mention per-condition labeled output format."""
        from researchclaw.prompts import PromptManager

        prompt = PromptManager().for_stage(
            "code_generation",
            topic="test",
            metric="loss",
            pkg_hint="",
            exp_plan="treatments: [A, B, C]",
        )
        assert "condition=" in prompt.user
class TestEvidenceBoundedWriting:
    """R7-2: Paper draft prompt must enforce evidence-bounded claims."""

    def test_paper_draft_has_evidence_bounding_rules(self) -> None:
        """System prompt should contain evidence-bounding rules."""
        from researchclaw.prompts import PromptManager

        prompt = PromptManager().for_stage(
            "paper_draft",
            preamble="test preamble",
            topic_constraint="",
            exp_metrics_instruction="",
            citation_instruction="",
            outline="# Outline",
        )
        lowered = prompt.system.lower()
        assert "EVIDENCE-BOUNDING RULES" in prompt.system
        assert "title" in lowered
        assert "causal claim" in lowered or "causal claims" in lowered

    def test_hedging_language_guidance(self) -> None:
        """Should suggest hedged alternatives like 'Toward...' for partial data."""
        from researchclaw.prompts import PromptManager

        prompt = PromptManager().for_stage(
            "paper_draft",
            preamble="",
            topic_constraint="",
            exp_metrics_instruction="",
            citation_instruction="",
            outline="",
        )
        assert "Toward" in prompt.system or "Investigating" in prompt.system
class TestConditionCoverageDetection:
    """R7-3: REFINE should detect condition coverage gaps."""

    def test_coverage_hint_injected_when_no_labels(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """If stdout has no 'condition=' labels, a coverage hint should be injected."""
        # Prior run: metrics present, but stdout carries no condition= labels.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "run_id": "run-1",
                "status": "completed",
                "metrics": {"primary_metric": 0.5},
                "stdout": "primary_metric: 0.5\nprimary_metric: 0.3\n",
            }),
            encoding="utf-8",
        )
        # Experiment plan declares three conditions the run never labeled.
        exp_plan_dir = run_dir / "stage-09"
        exp_plan_dir.mkdir(parents=True, exist_ok=True)
        (exp_plan_dir / "exp_plan.yaml").write_text(
            "conditions:\n - echo_chamber\n - bridge_building\n - random\n",
            encoding="utf-8",
        )
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('primary_metric: 0.5')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Minimal full config; refine capped at a single iteration.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient("```python\nprint('primary_metric: 0.3')\n```")
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # The refine prompt sent to the LLM must flag the coverage gap.
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        assert "CONDITION COVERAGE GAP" in user_msg

    def test_no_hint_when_labels_present(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """If stdout already has 'condition=' labels, no hint should be injected."""
        # Prior run: stdout labels each metric line with its condition.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "run_id": "run-1",
                "status": "completed",
                "metrics": {"primary_metric": 0.5},
                "stdout": "condition=echo primary_metric: 0.5\ncondition=bridge primary_metric: 0.3\n",
            }),
            encoding="utf-8",
        )
        exp_plan_dir = run_dir / "stage-09"
        exp_plan_dir.mkdir(parents=True, exist_ok=True)
        (exp_plan_dir / "exp_plan.yaml").write_text(
            "conditions:\n - echo\n - bridge\n",
            encoding="utf-8",
        )
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('primary_metric: 0.5')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Same minimal config as above.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient("```python\nprint('primary_metric: 0.3')\n```")
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        assert "CONDITION COVERAGE GAP" not in user_msg
# ===================================================================
# R8 Tests — AutoBench Round 1 Fixes
# ===================================================================
class TestBreadthFirstPrompt:
    """R8-1: Code generation prompt should require breadth-first condition ordering."""

    def test_breadth_first_in_code_generation(self) -> None:
        from researchclaw.prompts import PromptManager

        prompt = PromptManager().for_stage(
            "code_generation",
            topic="test",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions: [A, B, C]",
        )
        # Both the ordering directive and the representative-run wording must appear.
        for marker in ("BREADTH-FIRST", "ONE representative"):
            assert marker in prompt.user
class TestRefineFilePreservation:
    """R8-2: Refine should preserve supporting files when LLM only returns main.py."""

    def test_supporting_files_preserved_in_refine(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """When LLM returns only main.py, other project files should be preserved."""
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({
                "run_id": "run-1",
                "status": "completed",
                "metrics": {"primary_metric": 0.5},
                "stdout": "primary_metric: 0.5",
            }),
            encoding="utf-8",
        )
        # Multi-file experiment project
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("from helpers import foo\nprint('primary_metric: 0.5')\n")
        (exp_dir / "helpers.py").write_text("def foo(): return 42\n")
        (exp_dir / "utils.py").write_text("def bar(): return 99\n")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Minimal full config; refine capped at a single iteration.
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"],
                         "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True,
                              "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1",
                    "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key",
                    "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # LLM returns only main.py in multi-file format
        llm = FakeLLMClient("```filename:main.py\nfrom helpers import foo\nprint('primary_metric: 0.3')\n```")
        rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # Check that experiment_v1 has ALL files, not just main.py
        v1_dir = stage_dir / "experiment_v1"
        assert v1_dir.exists()
        v1_files = {f.name for f in v1_dir.glob("*.py")}
        assert "main.py" in v1_files
        assert "helpers.py" in v1_files, "Supporting file helpers.py should be preserved"
        assert "utils.py" in v1_files, "Supporting file utils.py should be preserved"
# ===================================================================
# R9 Tests — AutoBench Round 2 Fixes
# ===================================================================
class TestCodeGenTopicNeutral:
    """R9-1: Code generation prompt should be topic-neutral, not optimization-biased."""

    @staticmethod
    def _render_prompt():
        """Render the code_generation prompt for a non-optimization topic."""
        from researchclaw.prompts import PromptManager

        return PromptManager().for_stage(
            "code_generation",
            topic="multi-agent simulation",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions: [L1, L2, L3, L4]",
        )

    def test_no_gradient_descent_bias(self) -> None:
        prompt = self._render_prompt()
        # Should NOT contain optimization-specific examples as recommended approaches;
        # "gradient descent" may appear as anti-pattern warning but not as example.
        for banned in ("Adam", "SGD", "Rosenbrock", "e.g., gradient descent"):
            assert banned not in prompt.user

    def test_topic_relevant_guidance(self) -> None:
        prompt = self._render_prompt()
        # Should contain generic guidance that works for any topic
        lowered = prompt.user.lower()
        assert "simulation" in lowered or "appropriate" in lowered
        assert "ACTUAL experiment" in prompt.user or "relevant to the TOPIC" in prompt.user
class TestRefineTopicAlignment:
    """R9-2: Refine prompt should include topic-code alignment check."""

    def test_topic_alignment_in_refine_prompt(self) -> None:
        from researchclaw.prompts import PromptManager

        prompt = PromptManager().sub_prompt(
            "iterative_improve",
            metric_key="primary_metric",
            metric_direction="maximize",
            files_context="# main.py\nprint('hello')",
            run_summaries="{}",
            condition_coverage_hint="",
            topic="multi-agent diversity scaling",
            exp_plan_anchor="",
        )
        # Anchor header, the topic itself, and the rename prohibition must all appear.
        expected_markers = (
            "EXPERIMENT PLAN ANCHOR",
            "multi-agent diversity scaling",
            "NEVER rename",
        )
        for marker in expected_markers:
            assert marker in prompt.user
# =====================================================================
# _validate_draft_quality tests
# =====================================================================
def _make_prose(word_count: int) -> str: # noqa: E302
"""Generate flowing prose text of approximately *word_count* words."""
sentence = (
"This is a flowing academic prose sentence "
"that demonstrates our research findings. "
)
words_per = len(sentence.split())
return sentence * (word_count // words_per + 1)
def _make_bullets(word_count: int) -> str:
"""Generate bullet-point text of approximately *word_count* words."""
line = "- This is a bullet point about a research finding\n"
words_per = len(line.split())
return line * (word_count // words_per + 1)
def _make_comparative_prose(word_count: int) -> str:
"""Generate related-work style prose with comparative language."""
sentence = (
"Unlike prior work that focuses on simple baselines, "
"our approach differs by incorporating novel techniques. "
"In contrast to existing methods, we address key limitations. "
"However, while previous approaches rely on heuristics, "
"our method provides theoretical guarantees. "
)
words_per = len(sentence.split())
return sentence * (word_count // words_per + 1)
def _make_results_prose(word_count: int) -> str:
"""Generate results prose with statistical measures."""
sentence = (
"Our method achieves 85.3 ± 1.2 accuracy averaged over 5 seeds. "
"The baseline comparison yields a p-value of 0.003, confirming "
"statistical significance with 95% confidence interval. "
)
words_per = len(sentence.split())
return sentence * (word_count // words_per + 1)
def _build_draft(**section_overrides: str) -> str:
    """Build a paper draft with default prose sections.

    Keyword arguments replace the default body of the matching section
    (e.g. ``Method=_make_bullets(1200)``).
    """
    sections = {
        "Abstract": _make_prose(200),
        "Introduction": _make_prose(900),
        "Related Work": _make_comparative_prose(700),
        "Method": _make_prose(1200),
        "Experiments": _make_prose(1000),
        "Results": _make_results_prose(700),
        "Discussion": _make_prose(500),
        "Limitations": _make_prose(250),
        "Conclusion": _make_prose(250),
    }
    sections.update(section_overrides)
    pieces = ["# My Research Title\n"]
    pieces.extend(f"# {heading}\n{body}\n" for heading, body in sections.items())
    return "\n".join(pieces)
class TestValidateDraftQuality:
    """Tests for _validate_draft_quality()."""

    def test_short_section_triggers_warning(self) -> None:
        """Short Method section triggers expand warning."""
        report = rc_executor._validate_draft_quality(
            _build_draft(Method=_make_prose(200))
        )
        assert any("Method" in warning for warning in report["overall_warnings"])
        assert any(
            "EXPAND" in directive or "Expand" in directive
            for directive in report["revision_directives"]
        )

    def test_bullet_density_triggers_warning(self) -> None:
        """Bullet-heavy Method section triggers rewrite warning."""
        report = rc_executor._validate_draft_quality(
            _build_draft(Method=_make_bullets(1200))
        )
        assert any(
            "bullet" in warning.lower() or "density" in warning.lower()
            for warning in report["overall_warnings"]
        )
        assert any("REWRITE" in directive for directive in report["revision_directives"])

    def test_clean_draft_no_warnings(self) -> None:
        """Balanced prose draft produces zero warnings."""
        report = rc_executor._validate_draft_quality(_build_draft())
        assert not report["overall_warnings"]
        assert not report["revision_directives"]

    def test_balance_warning(self) -> None:
        """Large imbalance between sections triggers balance warning."""
        report = rc_executor._validate_draft_quality(_build_draft(
            Introduction=_make_prose(1500),
            Results=_make_prose(100),
        ))
        imbalance_hits = [
            warning for warning in report["overall_warnings"]
            if "imbalance" in warning.lower()
        ]
        assert len(imbalance_hits) >= 1, (
            f"Expected balance warning, got: {report['overall_warnings']}"
        )

    def test_writes_json_to_stage_dir(self, tmp_path: Path) -> None:
        """Quality report is written as draft_quality.json."""
        rc_executor._validate_draft_quality(
            _build_draft(Method=_make_prose(200)), stage_dir=tmp_path
        )
        report_path = tmp_path / "draft_quality.json"
        assert report_path.exists()
        payload = json.loads(report_path.read_text())
        for key in ("section_analysis", "overall_warnings", "revision_directives"):
            assert key in payload
================================================
FILE: tests/test_rc_hardware.py
================================================
"""Tests for researchclaw.hardware — GPU detection & metric filtering."""
from __future__ import annotations
import subprocess
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.hardware import (
HardwareProfile,
_detect_mps,
_detect_nvidia,
detect_hardware,
ensure_torch_available,
is_metric_name,
)
# ---------------------------------------------------------------------------
# HardwareProfile
# ---------------------------------------------------------------------------
class TestHardwareProfile:
    """Behavior of the HardwareProfile value object."""

    def test_to_dict(self):
        """to_dict() exposes the GPU fields verbatim."""
        profile = HardwareProfile(
            has_gpu=True, gpu_type="cuda", gpu_name="RTX 4090",
            vram_mb=24564, tier="high", warning="",
        )
        as_dict = profile.to_dict()
        assert as_dict["has_gpu"] is True
        assert as_dict["gpu_type"] == "cuda"
        assert as_dict["vram_mb"] == 24564

    def test_cpu_only_profile(self):
        """A CPU-only profile keeps its tier and warning text."""
        profile = HardwareProfile(
            has_gpu=False, gpu_type="cpu", gpu_name="CPU only",
            vram_mb=None, tier="cpu_only", warning="No GPU",
        )
        assert profile.tier == "cpu_only"
        assert profile.warning == "No GPU"
# ---------------------------------------------------------------------------
# NVIDIA detection
# ---------------------------------------------------------------------------
class TestDetectNvidia:
    """nvidia-smi based GPU detection paths."""

    @staticmethod
    def _smi(stdout: str, returncode: int = 0) -> MagicMock:
        # Fake a completed nvidia-smi subprocess invocation.
        proc = MagicMock()
        proc.returncode = returncode
        proc.stdout = stdout
        return proc

    def test_high_vram_nvidia(self):
        fake = self._smi("NVIDIA GeForce RTX 4090, 24564\n")
        with patch("researchclaw.hardware.subprocess.run", return_value=fake):
            profile = _detect_nvidia()
        assert profile is not None
        assert profile.has_gpu is True
        assert profile.gpu_type == "cuda"
        assert profile.gpu_name == "NVIDIA GeForce RTX 4090"
        assert profile.vram_mb == 24564
        assert profile.tier == "high"
        assert profile.warning == ""

    def test_low_vram_nvidia(self):
        fake = self._smi("NVIDIA GeForce GTX 1650, 4096\n")
        with patch("researchclaw.hardware.subprocess.run", return_value=fake):
            profile = _detect_nvidia()
        assert profile is not None
        assert profile.tier == "limited"
        assert "limited memory" in profile.warning

    def test_nvidia_smi_not_found(self):
        with patch(
            "researchclaw.hardware.subprocess.run",
            side_effect=FileNotFoundError,
        ):
            assert _detect_nvidia() is None

    def test_nvidia_smi_failure(self):
        with patch(
            "researchclaw.hardware.subprocess.run",
            return_value=self._smi("", returncode=1),
        ):
            assert _detect_nvidia() is None

    def test_nvidia_smi_timeout(self):
        timeout = subprocess.TimeoutExpired("nvidia-smi", 10)
        with patch("researchclaw.hardware.subprocess.run", side_effect=timeout):
            assert _detect_nvidia() is None
# ---------------------------------------------------------------------------
# MPS detection
# ---------------------------------------------------------------------------
class TestDetectMPS:
    """Apple-Silicon MPS detection paths."""

    def test_apple_silicon(self):
        smi = MagicMock()
        smi.returncode = 0
        smi.stdout = "Apple M3 Pro\n"
        with (
            patch("researchclaw.hardware.platform.system", return_value="Darwin"),
            patch("researchclaw.hardware.platform.machine", return_value="arm64"),
            patch("researchclaw.hardware.subprocess.run", return_value=smi),
        ):
            profile = _detect_mps()
        assert profile is not None
        assert profile.has_gpu is True
        assert profile.gpu_type == "mps"
        assert profile.gpu_name == "Apple M3 Pro"
        assert profile.tier == "limited"
        assert "MPS" in profile.warning

    def test_non_darwin(self):
        # Detection bails out immediately on non-macOS systems.
        with patch("researchclaw.hardware.platform.system", return_value="Linux"):
            assert _detect_mps() is None

    def test_intel_mac(self):
        # Intel Macs (x86_64) have no MPS device.
        with (
            patch("researchclaw.hardware.platform.system", return_value="Darwin"),
            patch("researchclaw.hardware.platform.machine", return_value="x86_64"),
        ):
            assert _detect_mps() is None
# ---------------------------------------------------------------------------
# detect_hardware (integration)
# ---------------------------------------------------------------------------
class TestDetectHardware:
    """detect_hardware() fallback ordering across backends."""

    def test_falls_back_to_cpu(self):
        with (
            patch("researchclaw.hardware._detect_nvidia", return_value=None),
            patch("researchclaw.hardware._detect_mps", return_value=None),
        ):
            profile = detect_hardware()
        assert profile.has_gpu is False
        assert profile.gpu_type == "cpu"
        assert profile.tier == "cpu_only"
        assert "No GPU" in profile.warning

    def test_nvidia_takes_priority(self):
        cuda = HardwareProfile(
            has_gpu=True, gpu_type="cuda", gpu_name="RTX 4090",
            vram_mb=24564, tier="high", warning="",
        )
        mps = HardwareProfile(
            has_gpu=True, gpu_type="mps", gpu_name="M3",
            vram_mb=None, tier="limited", warning="MPS",
        )
        with (
            patch("researchclaw.hardware._detect_nvidia", return_value=cuda),
            patch("researchclaw.hardware._detect_mps", return_value=mps),
        ):
            # CUDA wins when both detectors report a device.
            assert detect_hardware().gpu_type == "cuda"
# ---------------------------------------------------------------------------
# ensure_torch_available
# ---------------------------------------------------------------------------
class TestEnsureTorchAvailable:
    """Torch presence probe and conditional installation."""

    def test_already_installed(self):
        probe = MagicMock()
        probe.returncode = 0
        probe.stdout = "2.3.0\n"
        with patch("researchclaw.hardware.subprocess.run", return_value=probe):
            assert ensure_torch_available("/usr/bin/python3", "cuda") is True

    def test_cpu_only_skips_install(self):
        probe = MagicMock()
        probe.returncode = 1  # import probe fails: torch absent
        probe.stdout = ""
        with patch("researchclaw.hardware.subprocess.run", return_value=probe):
            assert ensure_torch_available("/usr/bin/python3", "cpu") is False

    def test_install_succeeds(self):
        seen: list[int] = []

        def fake_run(*_args, **_kwargs):
            seen.append(1)
            proc = MagicMock()
            proc.stdout = ""
            # First call: import probe fails; subsequent calls succeed.
            proc.returncode = 1 if len(seen) == 1 else 0
            return proc

        with patch("researchclaw.hardware.subprocess.run", side_effect=fake_run):
            assert ensure_torch_available("/usr/bin/python3", "cuda") is True

    def test_install_fails(self):
        failed = MagicMock()
        failed.returncode = 1
        failed.stdout = ""
        failed.stderr = "ERROR: Could not install"
        with patch("researchclaw.hardware.subprocess.run", return_value=failed):
            assert ensure_torch_available("/usr/bin/python3", "mps") is False

    def test_python_not_found(self):
        with patch(
            "researchclaw.hardware.subprocess.run",
            side_effect=FileNotFoundError,
        ):
            assert ensure_torch_available("/nonexistent/python3", "cuda") is False
# ---------------------------------------------------------------------------
# is_metric_name
# ---------------------------------------------------------------------------
class TestIsMetricName:
    """Distinguishing metric names from stray log lines."""

    def test_valid_metrics(self):
        for name in (
            "loss",
            "primary_metric",
            "UCB (Stochastic) cumulative_regret",
            "accuracy",
            "f1_score",
        ):
            assert is_metric_name(name) is True

    def test_log_lines_filtered(self):
        for line in (
            "Running experiments for support set size",
            "Loading model weights",
            "Training epoch 5",
            "Evaluating on test set",
            "Processing batch",
            "Initializing optimizer",
        ):
            assert is_metric_name(line) is False

    def test_too_many_words_filtered(self):
        assert is_metric_name("this is a very long name that has many words") is False

    def test_short_names_pass(self):
        assert is_metric_name("val_loss") is True
        assert is_metric_name("test accuracy score") is True
================================================
FILE: tests/test_rc_health.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false, reportMissingImports=false, reportUntypedNamedTuple=false, reportMissingTypeArgument=false, reportArgumentType=false
from __future__ import annotations
import json
import socket
import urllib.error
from pathlib import Path
from typing import NamedTuple, cast
from unittest.mock import patch
import pytest
from researchclaw import health
class _VersionInfo(NamedTuple):
major: int
minor: int
micro: int
releaselevel: str
serial: int
class _DummyHTTPResponse:
status: int
_payload: dict[str, object]
def __init__(
self, *, status: int = 200, payload: dict[str, object] | None = None
) -> None:
self.status = status
self._payload = payload if payload is not None else {}
def read(self) -> bytes:
return json.dumps(self._payload).encode("utf-8")
def __enter__(self) -> _DummyHTTPResponse:
return self
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
return None
def _write_valid_config(path: Path) -> None:
_ = path.write_text(
"""
project:
name: demo
research:
topic: Doctor checks
runtime:
timezone: UTC
notifications:
channel: test
knowledge_base:
root: kb
llm:
base_url: https://api.example.com/v1
api_key_env: OPENAI_API_KEY
""".strip()
+ "\n",
encoding="utf-8",
)
def test_check_python_version_pass() -> None:
    """Python 3.11 satisfies the minimum-version check."""
    with patch("sys.version_info", _VersionInfo(3, 11, 0, "final", 0)):
        assert health.check_python_version().status == "pass"

def test_check_python_version_fail() -> None:
    """Python 3.10 is rejected with an actionable fix."""
    with patch("sys.version_info", _VersionInfo(3, 10, 9, "final", 0)):
        outcome = health.check_python_version()
    assert outcome.status == "fail"
    assert outcome.fix == "Install Python 3.11 or newer"

def test_check_yaml_import_pass() -> None:
    """An importable yaml module passes the check."""
    with patch("importlib.import_module", return_value=object()):
        assert health.check_yaml_import().status == "pass"

def test_check_yaml_import_fail() -> None:
    """A missing yaml module fails with an install hint."""
    with patch("importlib.import_module", side_effect=ImportError):
        outcome = health.check_yaml_import()
    assert outcome.status == "fail"
    assert outcome.fix == "pip install pyyaml"
def test_check_config_valid_pass(tmp_path: Path) -> None:
    """A complete config file passes validation."""
    cfg = tmp_path / "config.yaml"
    _write_valid_config(cfg)
    assert health.check_config_valid(cfg).status == "pass"

def test_check_config_invalid(tmp_path: Path) -> None:
    """A config missing required fields fails with a named field."""
    cfg = tmp_path / "config.yaml"
    _ = cfg.write_text("project: {}\n", encoding="utf-8")
    outcome = health.check_config_valid(cfg)
    assert outcome.status == "fail"
    assert "Missing required field:" in outcome.detail

def test_check_config_missing_file(tmp_path: Path) -> None:
    """A nonexistent config path fails with a not-found message."""
    outcome = health.check_config_valid(tmp_path / "missing.yaml")
    assert outcome.status == "fail"
    assert "Config file not found" in outcome.detail
def test_check_llm_connectivity_pass() -> None:
    """A 200 response marks the endpoint reachable."""
    ok = _DummyHTTPResponse(status=200)
    with patch("urllib.request.urlopen", return_value=ok):
        outcome = health.check_llm_connectivity("https://api.example.com/v1")
    assert outcome.status == "pass"

def test_check_llm_connectivity_timeout() -> None:
    """A socket timeout is reported as an unreachable endpoint."""
    err = urllib.error.URLError(socket.timeout("timed out"))
    with patch("urllib.request.urlopen", side_effect=err):
        outcome = health.check_llm_connectivity("https://api.example.com/v1")
    assert outcome.status == "fail"
    assert outcome.detail == "LLM endpoint unreachable"

def test_check_llm_connectivity_http_error() -> None:
    """An HTTP error surfaces its status code in the detail."""
    err = urllib.error.HTTPError(
        "https://api.example.com/v1/models", 503, "unavailable", {}, None
    )
    with patch("urllib.request.urlopen", side_effect=err):
        outcome = health.check_llm_connectivity("https://api.example.com/v1")
    assert outcome.status == "fail"
    assert "503" in outcome.detail
def test_check_api_key_valid() -> None:
    """A 200 model listing confirms the key works."""
    ok = _DummyHTTPResponse(status=200, payload={"data": []})
    with patch("urllib.request.urlopen", return_value=ok):
        outcome = health.check_api_key_valid("https://api.example.com/v1", "sk-test")
    assert outcome.status == "pass"

def test_check_api_key_invalid_401() -> None:
    """A 401 response is reported as an invalid key."""
    err = urllib.error.HTTPError(
        "https://api.example.com/v1/models", 401, "unauthorized", {}, None
    )
    with patch("urllib.request.urlopen", side_effect=err):
        outcome = health.check_api_key_valid("https://api.example.com/v1", "bad")
    assert outcome.status == "fail"
    assert outcome.detail == "Invalid API key"
def test_check_model_available_pass() -> None:
    """A model present in the listing passes the check."""
    listing = {"data": [{"id": "gpt-5.2"}, {"id": "gpt-4o"}]}
    with patch(
        "urllib.request.urlopen",
        return_value=_DummyHTTPResponse(status=200, payload=listing),
    ):
        outcome = health.check_model_available(
            "https://api.example.com/v1", "sk-test", "gpt-5.2"
        )
    assert outcome.status == "pass"

def test_check_model_not_available() -> None:
    """A model absent from the listing fails with its name in the detail."""
    listing = {"data": [{"id": "gpt-4o"}]}
    with patch(
        "urllib.request.urlopen",
        return_value=_DummyHTTPResponse(status=200, payload=listing),
    ):
        outcome = health.check_model_available(
            "https://api.example.com/v1", "sk-test", "gpt-5.2"
        )
    assert outcome.status == "fail"
    assert outcome.detail == "Model gpt-5.2 not available"
def test_check_model_chain_all_available() -> None:
    """Primary and fallback models both present yields a clean pass."""
    listing = {"data": [{"id": "gpt-4o"}, {"id": "gpt-4.1"}]}
    with patch(
        "urllib.request.urlopen",
        return_value=_DummyHTTPResponse(status=200, payload=listing),
    ):
        outcome = health.check_model_chain(
            "https://api.example.com/v1", "sk-test", "gpt-4o", ("gpt-4.1",)
        )
    assert outcome.status == "pass"
    assert "All models available" in outcome.detail

def test_check_model_chain_primary_missing_fallback_ok() -> None:
    """A missing primary still passes when a fallback is available."""
    listing = {"data": [{"id": "gpt-4.1"}, {"id": "gpt-4o-mini"}]}
    with patch(
        "urllib.request.urlopen",
        return_value=_DummyHTTPResponse(status=200, payload=listing),
    ):
        outcome = health.check_model_chain(
            "https://api.example.com/v1", "sk-test",
            "gpt-5.2", ("gpt-4.1", "gpt-4o-mini")
        )
    assert outcome.status == "pass"
    assert "unavailable" in outcome.detail
    assert "gpt-5.2" in outcome.detail

def test_check_model_chain_all_missing() -> None:
    """No configured model available at all is a failure."""
    listing = {"data": [{"id": "gpt-4o"}]}
    with patch(
        "urllib.request.urlopen",
        return_value=_DummyHTTPResponse(status=200, payload=listing),
    ):
        outcome = health.check_model_chain(
            "https://api.example.com/v1", "sk-test", "gpt-5.2", ("gpt-5.1",)
        )
    assert outcome.status == "fail"
    assert "No models available" in outcome.detail

def test_check_model_chain_no_models() -> None:
    """An empty model chain warns instead of failing."""
    outcome = health.check_model_chain(
        "https://api.example.com/v1", "sk-test", "", ()
    )
    assert outcome.status == "warn"
    assert "No models configured" in outcome.detail
def test_check_sandbox_python_exists() -> None:
    """An existing, executable sandbox interpreter passes."""
    with (
        patch.object(Path, "exists", return_value=True),
        patch("os.access", return_value=True),
    ):
        assert health.check_sandbox_python(".venv_arc/bin/python3").status == "pass"

def test_check_sandbox_python_missing() -> None:
    """A missing sandbox interpreter only warns."""
    with (
        patch.object(Path, "exists", return_value=False),
        patch("os.access", return_value=False),
    ):
        assert health.check_sandbox_python(".venv_arc/bin/python3").status == "warn"

def test_check_matplotlib_available() -> None:
    """An importable matplotlib passes the check."""
    with patch("importlib.import_module", return_value=object()):
        assert health.check_matplotlib().status == "pass"

def test_check_matplotlib_missing() -> None:
    """A missing matplotlib warns that charts will be skipped."""
    with patch("importlib.import_module", side_effect=ImportError):
        outcome = health.check_matplotlib()
    assert outcome.status == "warn"
    assert outcome.detail == "Not installed; charts will be skipped"
def test_check_experiment_mode_simulated() -> None:
    """Simulated mode is flagged with a warning."""
    assert health.check_experiment_mode("simulated").status == "warn"

def test_check_experiment_mode_sandbox() -> None:
    """Sandbox mode passes cleanly."""
    assert health.check_experiment_mode("sandbox").status == "pass"
def test_run_doctor_all_pass_openai(tmp_path: Path) -> None:
    """All nine checks passing yields an overall 'pass' report."""
    config_path = tmp_path / "config.yaml"
    _ = config_path.write_text("project: {}\n", encoding="utf-8")

    def passing(check_name: str):
        # Stub check that ignores its arguments and reports a pass.
        return lambda *args, **kwargs: health.CheckResult(check_name, "pass", "ok")

    stubs = {
        f"check_{name}": passing(name)
        for name in (
            "python_version", "yaml_import", "config_valid",
            "llm_connectivity", "api_key_valid", "model_chain",
            "sandbox_python", "matplotlib", "experiment_mode",
        )
    }
    with patch.multiple(health, **stubs):
        report = health.run_doctor(config_path)
    assert report.overall == "pass"
    assert len(report.checks) == 9
def test_run_doctor_with_failures(tmp_path: Path) -> None:
    """A failing check drives overall to 'fail' and surfaces its fix."""
    config_path = tmp_path / "config.yaml"
    _ = config_path.write_text("project: {}\n", encoding="utf-8")

    results = {
        "check_python_version": health.CheckResult("python_version", "pass", "ok"),
        "check_yaml_import": health.CheckResult("yaml_import", "pass", "ok"),
        "check_config_valid": health.CheckResult("config_valid", "fail", "bad", "fix it"),
        "check_llm_connectivity": health.CheckResult("llm_connectivity", "pass", "ok"),
        "check_api_key_valid": health.CheckResult("api_key_valid", "warn", "warn", "later"),
        "check_model_chain": health.CheckResult("model_chain", "pass", "ok"),
        "check_sandbox_python": health.CheckResult("sandbox_python", "pass", "ok"),
        "check_matplotlib": health.CheckResult("matplotlib", "pass", "ok"),
        "check_experiment_mode": health.CheckResult("experiment_mode", "pass", "ok"),
    }
    # Bind each canned result into its own zero-arg-agnostic stub.
    stubs = {
        name: (lambda res: lambda *a, **k: res)(res)
        for name, res in results.items()
    }
    with patch.multiple(health, **stubs):
        report = health.run_doctor(config_path)
    assert report.overall == "fail"
    assert "fix it" in report.actionable_fixes
def test_doctor_report_json_structure(tmp_path: Path) -> None:
    """write_doctor_report emits timestamp, checks, and actionable fixes as JSON."""
    checks = [
        health.CheckResult("python_version", "pass", "ok"),
        health.CheckResult(
            "matplotlib", "warn", "missing", "pip install matplotlib"
        ),
    ]
    report = health.DoctorReport(
        timestamp="2026-01-01T00:00:00+00:00", checks=checks, overall="pass"
    )
    output_path = tmp_path / "reports" / "doctor.json"
    health.write_doctor_report(report, output_path)
    raw = cast(dict[str, object], json.loads(output_path.read_text(encoding="utf-8")))
    assert raw["timestamp"] == "2026-01-01T00:00:00+00:00"
    assert raw["overall"] == "pass"
    assert isinstance(raw["checks"], list)
    assert raw["actionable_fixes"] == ["pip install matplotlib"]

def test_doctor_report_overall_logic() -> None:
    """actionable_fixes collects fixes only from non-passing checks."""
    passing = health.DoctorReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[health.CheckResult("x", "pass", "ok")],
        overall="pass",
    )
    failing = health.DoctorReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[health.CheckResult("x", "fail", "bad", "fix")],
        overall="fail",
    )
    assert passing.overall == "pass"
    assert failing.overall == "fail"
    assert failing.actionable_fixes == ["fix"]
def test_print_doctor_report_pass(capsys: pytest.CaptureFixture[str]) -> None:
    """A passing report prints the check emoji and a PASS summary."""
    report = health.DoctorReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[health.CheckResult("python_version", "pass", "ok")],
        overall="pass",
    )
    health.print_doctor_report(report)
    printed = capsys.readouterr().out
    assert "✅" in printed
    assert "Result: PASS" in printed

def test_print_doctor_report_fail(capsys: pytest.CaptureFixture[str]) -> None:
    """A failing report prints per-status emoji and an error/warning tally."""
    report = health.DoctorReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[
            health.CheckResult("config_valid", "fail", "bad config", "fix config"),
            health.CheckResult(
                "matplotlib", "warn", "missing", "pip install matplotlib"
            ),
        ],
        overall="fail",
    )
    health.print_doctor_report(report)
    printed = capsys.readouterr().out
    for marker in ("❌", "⚠️", "Result: FAIL (1 errors, 1 warnings)"):
        assert marker in printed
# --- ACP agent checks ---
def test_check_acp_agent_found() -> None:
    """An agent binary on PATH passes with its resolved location."""
    with patch("shutil.which", return_value="/usr/local/bin/claude"):
        outcome = health.check_acp_agent("claude")
    assert outcome.status == "pass"
    assert "/usr/local/bin/claude" in outcome.detail

def test_check_acp_agent_missing() -> None:
    """A missing agent binary fails with an install hint."""
    with patch("shutil.which", return_value=None):
        outcome = health.check_acp_agent("claude")
    assert outcome.status == "fail"
    assert "'claude' not found" in outcome.detail
    assert "Install claude" in outcome.fix
def _write_acp_config(path: Path) -> None:
_ = path.write_text(
"""\
project:
name: demo
research:
topic: ACP test
runtime:
timezone: UTC
notifications:
channel: test
knowledge_base:
root: kb
llm:
provider: acp
acp:
agent: claude
""",
encoding="utf-8",
)
def test_run_doctor_acp_skips_http_checks(tmp_path: Path) -> None:
    """With provider=acp the HTTP-based LLM checks are not run."""
    config_path = tmp_path / "config.yaml"
    _write_acp_config(config_path)

    def passing(check_name: str):
        # Stub check reporting a pass regardless of arguments.
        return lambda *args, **kwargs: health.CheckResult(check_name, "pass", "ok")

    stubs = {
        f"check_{name}": passing(name)
        for name in (
            "python_version", "yaml_import", "config_valid", "acp_agent",
            "sandbox_python", "matplotlib", "experiment_mode",
        )
    }
    with patch.multiple(health, **stubs):
        report = health.run_doctor(config_path)
    executed = [check.name for check in report.checks]
    assert "llm_connectivity" not in executed
    assert "api_key_valid" not in executed
    assert "model_chain" not in executed
def test_run_doctor_acp_includes_agent_check(tmp_path: Path) -> None:
    """The ACP provider path runs exactly the seven non-HTTP checks."""
    config_path = tmp_path / "config.yaml"
    _write_acp_config(config_path)

    def passing(check_name: str):
        # Stub check reporting a pass regardless of arguments.
        return lambda *args, **kwargs: health.CheckResult(check_name, "pass", "ok")

    stubs = {
        f"check_{name}": passing(name)
        for name in (
            "python_version", "yaml_import", "config_valid", "acp_agent",
            "sandbox_python", "matplotlib", "experiment_mode",
        )
    }
    with patch.multiple(health, **stubs):
        report = health.run_doctor(config_path)
    names = [check.name for check in report.checks]
    assert "acp_agent" in names
    assert report.overall == "pass"
    assert len(report.checks) == 7
def test_print_doctor_report_ascii_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
    """On an ASCII-only stdout the report falls back to [OK]-style markers."""
    report = health.DoctorReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[health.CheckResult("python_version", "pass", "ok")],
        overall="pass",
    )

    class _AsciiStdout:
        # Pretends to be a stream that cannot encode non-ASCII characters.
        encoding = "ascii"

        def __init__(self) -> None:
            self.parts: list[str] = []

        def write(self, text: str) -> int:
            text.encode(self.encoding)  # raises UnicodeEncodeError on emoji
            self.parts.append(text)
            return len(text)

        def flush(self) -> None:
            return None

    stream = _AsciiStdout()
    monkeypatch.setattr(health.sys, "stdout", stream)
    health.print_doctor_report(report)
    printed = "".join(stream.parts)
    assert "[OK] python_version: ok" in printed
    assert "Result: PASS" in printed
================================================
FILE: tests/test_rc_kb.py
================================================
from __future__ import annotations
import json
from pathlib import Path
import yaml
from researchclaw.knowledge.base import (
KB_CATEGORY_MAP,
KBEntry,
_markdown_frontmatter,
_obsidian_enhancements,
generate_weekly_report,
write_kb_entry,
write_stage_to_kb,
)
def _kb_root(tmp_path: Path) -> Path:
return tmp_path / "kb"
def test_kb_entry_dataclass_creation():
    """KBEntry keeps its constructor fields verbatim."""
    entry = KBEntry(
        category="findings",
        entry_id="e1",
        title="T",
        content="C",
        source_stage="01-goal_define",
        run_id="run1",
    )
    assert entry.category == "findings"
    assert entry.entry_id == "e1"
    assert entry.run_id == "run1"
def test_write_kb_entry_creates_expected_file_path(tmp_path: Path):
    """The entry lands at <kb>/<category>/<entry_id>.md."""
    root = _kb_root(tmp_path)
    entry = KBEntry("questions", "q-1", "Q", "Body", "01-goal_define", "run-a")
    written = write_kb_entry(root, entry)
    assert written == root / "questions" / "q-1.md"
    assert written.exists()

def test_write_kb_entry_includes_frontmatter_markers(tmp_path: Path):
    """The written file is wrapped in YAML frontmatter fences."""
    entry = KBEntry("findings", "f-1", "Finding", "Body", "14-result_analysis", "run-a")
    body = write_kb_entry(_kb_root(tmp_path), entry).read_text(encoding="utf-8")
    assert body.startswith("---\n")
    assert "\n---\n" in body
def test_write_kb_entry_markdown_backend_has_no_obsidian_extras(tmp_path: Path):
    """The plain markdown backend omits wikilinks and hash-tags."""
    entry = KBEntry(
        "questions",
        "q-2",
        "Question",
        "Body",
        "01-goal_define",
        "run-a",
        tags=["hypothesis"],
        links=["run-run-a"],
    )
    rendered = write_kb_entry(
        _kb_root(tmp_path), entry, backend="markdown"
    ).read_text(encoding="utf-8")
    assert "[[run-run-a]]" not in rendered
    assert "#hypothesis" not in rendered

def test_write_kb_entry_obsidian_backend_includes_tags_and_wikilinks(tmp_path: Path):
    """The obsidian backend renders hash-tags and Related wikilinks."""
    entry = KBEntry(
        "questions",
        "q-3",
        "Question",
        "Body",
        "01-goal_define",
        "run-a",
        tags=["hypothesis", "q1"],
        links=["run-run-a", "topic-a"],
    )
    rendered = write_kb_entry(
        _kb_root(tmp_path), entry, backend="obsidian"
    ).read_text(encoding="utf-8")
    assert "#hypothesis #q1" in rendered
    assert "Related: [[run-run-a]], [[topic-a]]" in rendered
def test_markdown_frontmatter_output_format_and_fields():
    """Frontmatter is fenced YAML whose fields mirror the entry."""
    entry = KBEntry(
        "reviews",
        "r-1",
        "Report",
        "Body",
        "report",
        "run-x",
        tags=["weekly"],
        evidence_refs=["stage-01/goal.md"],
    )
    fm = _markdown_frontmatter(entry)
    assert fm.startswith("---\n")
    assert fm.endswith("\n---\n")
    inner = fm.split("---\n", 1)[1].rsplit("\n---\n", 1)[0]
    parsed = yaml.safe_load(inner)
    expected = {
        "id": "r-1",
        "title": "Report",
        "stage": "report",
        "run_id": "run-x",
        "tags": ["weekly"],
        "evidence": ["stage-01/goal.md"],
    }
    for key, value in expected.items():
        assert parsed[key] == value
def test_obsidian_enhancements_with_tags_and_links():
    """Tags render as hash-tags and links as a Related wikilink list."""
    entry = KBEntry(
        "findings",
        "f-2",
        "Finding",
        "Body",
        "14-result_analysis",
        "run-z",
        tags=["a", "b"],
        links=["run-z", "result-node"],
    )
    extras = _obsidian_enhancements(entry)
    assert "#a #b" in extras
    assert "Related: [[run-z]], [[result-node]]" in extras

def test_obsidian_enhancements_with_no_tags_or_links_returns_empty():
    """An entry without tags or links produces no extra markup."""
    bare = KBEntry("findings", "f-3", "Finding", "Body", "14-result_analysis", "run-z")
    assert _obsidian_enhancements(bare) == ""
def test_kb_category_map_has_exactly_22_stage_entries():
    """The stage→category map covers stages 1 through 22 exactly."""
    assert len(KB_CATEGORY_MAP) == 22
    assert set(KB_CATEGORY_MAP) == set(range(1, 23))

def test_kb_category_map_values_are_valid_categories():
    """Every mapped category is one of the six known KB categories."""
    allowed = {
        "questions",
        "literature",
        "experiments",
        "findings",
        "decisions",
        "reviews",
    }
    assert not set(KB_CATEGORY_MAP.values()) - allowed
def test_write_stage_to_kb_places_entry_in_mapped_category(tmp_path: Path):
    """Stage 10 entries go into the 'experiments' category directory."""
    stage_dir = tmp_path / "stage-10"
    stage_dir.mkdir()
    (stage_dir / "run.md").write_text("exp content", encoding="utf-8")
    written = write_stage_to_kb(
        _kb_root(tmp_path), 10, "experiment_cycle", "run-1", ["run.md"], stage_dir
    )
    assert len(written) == 1
    assert written[0].parent.name == "experiments"

def test_write_stage_to_kb_reads_artifact_file_contents(tmp_path: Path):
    """Artifact file contents and their stage-relative path are embedded."""
    stage_dir = tmp_path / "stage-04"
    stage_dir.mkdir()
    (stage_dir / "lit.md").write_text("paper A\npaper B", encoding="utf-8")
    entry_path = write_stage_to_kb(
        _kb_root(tmp_path), 4, "literature_search", "run-1", ["lit.md"], stage_dir
    )[0]
    body = entry_path.read_text(encoding="utf-8")
    assert "paper A" in body
    assert "stage-04/lit.md" in body

def test_write_stage_to_kb_handles_missing_artifacts_gracefully(tmp_path: Path):
    """A missing artifact still yields a completion entry."""
    stage_dir = tmp_path / "stage-05"
    stage_dir.mkdir()
    entry_path = write_stage_to_kb(
        _kb_root(tmp_path), 5, "literature_extract", "run-2", ["missing.md"], stage_dir
    )[0]
    body = entry_path.read_text(encoding="utf-8")
    assert "Stage 05 (literature_extract) completed" in body
def test_write_stage_to_kb_truncates_large_artifact_content(tmp_path: Path):
    """Oversized artifacts are clipped with a truncation marker."""
    stage_dir = tmp_path / "stage-12"
    stage_dir.mkdir()
    (stage_dir / "big.txt").write_text("x" * 6000, encoding="utf-8")
    entry = write_stage_to_kb(
        _kb_root(tmp_path), 12, "experiment_implement", "run-3", ["big.txt"], stage_dir
    )[0]
    body = entry.read_text(encoding="utf-8")
    assert "... (truncated, see full artifact)" in body
    assert body.count("x") >= 5000

def test_write_stage_to_kb_directory_artifact_records_listing(tmp_path: Path):
    """Directory artifacts are recorded as a file-count listing."""
    stage_dir = tmp_path / "stage-13"
    artifact_dir = stage_dir / "outputs"
    artifact_dir.mkdir(parents=True)
    for name, payload in (("a.txt", "a"), ("b.txt", "b")):
        (artifact_dir / name).write_text(payload, encoding="utf-8")
    entry = write_stage_to_kb(
        _kb_root(tmp_path), 13, "experiment_execute", "run-4", ["outputs/"], stage_dir
    )[0]
    body = entry.read_text(encoding="utf-8")
    assert "Directory with 2 files: a.txt, b.txt" in body
    assert "stage-13/outputs/" in body
def test_generate_weekly_report_creates_file_in_reviews_category(tmp_path: Path):
    """Weekly reports land in 'reviews' with the week label in the name."""
    run_dir = tmp_path / "run-a"
    run_dir.mkdir()
    summary = {"run_id": "run-a", "stages_executed": 10, "stages_done": 10}
    (run_dir / "pipeline_summary.json").write_text(
        json.dumps(summary), encoding="utf-8"
    )
    report = generate_weekly_report(
        _kb_root(tmp_path), [run_dir], week_label="2026-W10"
    )
    assert report.parent.name == "reviews"
    assert report.name == "weekly-report-2026-W10.md"

def test_generate_weekly_report_with_empty_run_dirs(tmp_path: Path):
    """No runs produces a zero-run report with an N/A success rate."""
    report = generate_weekly_report(_kb_root(tmp_path), [], week_label="2026-W11")
    body = report.read_text(encoding="utf-8")
    assert "Pipeline runs: 0" in body
    assert "Success rate: N/A" in body
def test_generate_weekly_report_aggregates_statistics_correctly(tmp_path: Path):
    """Stage counts and success rate are summed across run summaries."""
    def make_run(name: str, summary: dict) -> Path:
        # Create a run directory containing a pipeline_summary.json.
        run_dir = tmp_path / name
        run_dir.mkdir()
        (run_dir / "pipeline_summary.json").write_text(
            json.dumps(summary), encoding="utf-8"
        )
        return run_dir

    run1 = make_run("run-1", {
        "run_id": "run-1",
        "stages_executed": 20,
        "stages_done": 18,
        "stages_failed": 1,
        "stages_blocked": 1,
        "final_status": "failed",
    })
    run2 = make_run("run-2", {
        "run_id": "run-2",
        "stages_executed": 10,
        "stages_done": 10,
        "stages_failed": 0,
        "stages_blocked": 0,
        "final_status": "done",
    })
    report = generate_weekly_report(
        _kb_root(tmp_path), [run1, run2], week_label="2026-W12"
    )
    body = report.read_text(encoding="utf-8")
    for expected in (
        "Pipeline runs: 2",
        "Stages executed: 30",
        "Stages completed: 28",
        "Stages failed: 1",
        "Stages blocked (gate): 1",
        "Success rate: 93.3%",
    ):
        assert expected in body
def test_generate_weekly_report_ignores_missing_summary_files(tmp_path: Path):
    """Run dirs without pipeline_summary.json are excluded from the tally."""
    run_ok = tmp_path / "run-ok"
    run_empty = tmp_path / "run-empty"
    run_ok.mkdir()
    run_empty.mkdir()
    summary = {"run_id": "run-ok", "stages_executed": 5, "stages_done": 5}
    (run_ok / "pipeline_summary.json").write_text(
        json.dumps(summary), encoding="utf-8"
    )
    report = generate_weekly_report(
        _kb_root(tmp_path), [run_ok, run_empty], week_label="2026-W13"
    )
    assert "Pipeline runs: 1" in report.read_text(encoding="utf-8")
================================================
FILE: tests/test_rc_literature.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false
"""Unit tests for researchclaw.literature module.
All network-dependent tests mock HTTP responses via monkeypatch.
"""
from __future__ import annotations
import json
import textwrap
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.literature.models import Author, Paper
from researchclaw.literature.semantic_scholar import (
_parse_s2_paper,
search_semantic_scholar,
)
from researchclaw.literature.arxiv_client import (
_convert_result,
search_arxiv,
)
from researchclaw.literature.search import (
_deduplicate,
_normalise_title,
papers_to_bibtex,
search_papers,
search_papers_multi_query,
)
# ──────────────────────────────────────────────────────────────────────
# Fixtures & helpers
# ──────────────────────────────────────────────────────────────────────
def _make_paper(**kwargs: Any) -> Paper:
    """Build a Paper with sensible defaults; keyword args override any field."""
    fields: dict[str, Any] = {
        "paper_id": "s2-abc",
        "title": "Attention Is All You Need",
        "authors": (Author(name="Ashish Vaswani"),),
        "year": 2017,
        "venue": "NeurIPS",
        "citation_count": 80000,
        "doi": "10.5555/3295222.3295349",
        "arxiv_id": "1706.03762",
        "url": "https://arxiv.org/abs/1706.03762",
        "source": "semantic_scholar",
    }
    fields.update(kwargs)
    return Paper(**fields)
SAMPLE_S2_RESPONSE = {
"total": 1,
"data": [
{
"paperId": "abc123",
"title": "Test Paper on Transformers",
"abstract": "We study transformers for NLP tasks.",
"year": 2024,
"venue": "NeurIPS",
"citationCount": 42,
"authors": [
{"authorId": "1", "name": "Jane Smith"},
{"authorId": "2", "name": "John Doe"},
],
"externalIds": {"DOI": "10.1234/test", "ArXiv": "2401.00001"},
"url": "https://www.semanticscholar.org/paper/abc123",
}
],
}
SAMPLE_ARXIV_ATOM = textwrap.dedent("""\
http://arxiv.org/abs/2401.00001v1
A Novel Approach to Protein Folding
We propose a new method for protein structure prediction.
2024-01-15T00:00:00Z
Alice Researcher
Bob Scientist
10.5678/protein
http://arxiv.org/abs/2402.00002v1
Deep Reinforcement Learning Survey
A comprehensive survey of deep RL methods.
2024-02-20T00:00:00Z
Charlie Expert
""")
# ──────────────────────────────────────────────────────────────────────
# Author tests
# ──────────────────────────────────────────────────────────────────────
class TestAuthor:
    """Author.last_name(): lowercased, accent-folded surname extraction."""

    def test_last_name_simple(self) -> None:
        assert Author(name="Jane Smith").last_name() == "smith"

    def test_last_name_accented(self) -> None:
        # Accents are stripped while base letters are preserved.
        assert Author(name="José García").last_name() == "garcia"

    def test_last_name_single(self) -> None:
        # A mononym is its own surname.
        assert Author(name="Madonna").last_name() == "madonna"

    def test_last_name_empty(self) -> None:
        # Empty names fall back to the sentinel "unknown".
        assert Author(name="").last_name() == "unknown"
# ──────────────────────────────────────────────────────────────────────
# Paper tests
# ──────────────────────────────────────────────────────────────────────
class TestPaper:
    """Paper: cite keys, BibTeX rendering, dict conversion, immutability."""

    def test_cite_key_format(self) -> None:
        assert _make_paper().cite_key == "vaswani2017attention"

    def test_cite_key_no_authors(self) -> None:
        assert _make_paper(authors=()).cite_key.startswith("anon")

    def test_cite_key_no_year(self) -> None:
        assert "0000" in _make_paper(year=0).cite_key

    def test_to_bibtex_contains_fields(self) -> None:
        bib = _make_paper().to_bibtex()
        for fragment in (
            "@inproceedings{vaswani2017attention,",
            "title = {Attention Is All You Need}",
            "author = {Ashish Vaswani}",
            "year = {2017}",
            "doi = {10.5555/3295222.3295349}",
            "eprint = {1706.03762}",
        ):
            assert fragment in bib

    def test_to_bibtex_override(self) -> None:
        # An explicit override is returned verbatim.
        override = "@article{custom, title={Custom}}"
        assert _make_paper(_bibtex_override=override).to_bibtex() == override

    def test_to_bibtex_article_no_venue(self) -> None:
        bib = _make_paper(venue="", arxiv_id="2301.00001").to_bibtex()
        assert "@article{" in bib
        assert "journal = {arXiv preprint arXiv:2301.00001}" in bib

    def test_to_bibtex_arxiv_category_venue(self) -> None:
        """T1.4: arXiv category codes (cs.CL) must not be used as journal names."""
        bib = _make_paper(venue="cs.CL", arxiv_id="2301.00001").to_bibtex()
        assert "journal = {cs.CL}" not in bib
        assert "arXiv preprint" in bib

    def test_to_dict(self) -> None:
        data = _make_paper().to_dict()
        assert data["paper_id"] == "s2-abc"
        assert data["cite_key"] == "vaswani2017attention"
        assert isinstance(data["authors"], list)
        assert data["authors"][0]["name"] == "Ashish Vaswani"

    def test_paper_frozen(self) -> None:
        # Paper is a frozen dataclass; assignment must raise.
        paper = _make_paper()
        with pytest.raises(AttributeError):
            paper.title = "new title"  # type: ignore[misc]
# ──────────────────────────────────────────────────────────────────────
# Semantic Scholar client tests
# ──────────────────────────────────────────────────────────────────────
class TestSemanticScholar:
    """Semantic Scholar client tests — HTTP layer fully mocked."""

    def test_parse_s2_paper(self) -> None:
        paper = _parse_s2_paper(SAMPLE_S2_RESPONSE["data"][0])
        assert paper.paper_id == "s2-abc123"
        assert paper.title == "Test Paper on Transformers"
        assert len(paper.authors) == 2
        assert paper.authors[0].name == "Jane Smith"
        assert paper.year == 2024
        assert paper.doi == "10.1234/test"
        assert paper.arxiv_id == "2401.00001"
        assert paper.source == "semantic_scholar"
        assert paper.citation_count == 42

    def test_search_semantic_scholar_mock(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Mock urllib to return sample S2 response."""
        # Reset S2 circuit breaker (may be tripped from prior test API calls)
        from researchclaw.literature.semantic_scholar import _reset_circuit_breaker

        _reset_circuit_breaker()
        fake_resp = MagicMock()
        fake_resp.read.return_value = json.dumps(SAMPLE_S2_RESPONSE).encode("utf-8")
        fake_resp.__enter__ = lambda s: s
        fake_resp.__exit__ = MagicMock(return_value=False)
        monkeypatch.setattr(
            "researchclaw.literature.semantic_scholar.urllib.request.urlopen",
            lambda *a, **kw: fake_resp,
        )
        results = search_semantic_scholar("transformers", limit=5)
        assert len(results) == 1
        assert results[0].title == "Test Paper on Transformers"

    def test_search_semantic_scholar_network_error(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Should return empty list on network error."""
        import urllib.error

        from researchclaw.literature.semantic_scholar import _reset_circuit_breaker

        _reset_circuit_breaker()

        def _raise(*a: Any, **kw: Any) -> Any:
            raise urllib.error.URLError("timeout")

        monkeypatch.setattr(
            "researchclaw.literature.semantic_scholar.urllib.request.urlopen",
            _raise,
        )
        # Stub sleep so the client's retry loop doesn't slow the suite down.
        monkeypatch.setattr(
            "researchclaw.literature.semantic_scholar.time.sleep", lambda _: None
        )
        assert search_semantic_scholar("test", limit=5) == []
# ──────────────────────────────────────────────────────────────────────
# arXiv client tests
# ──────────────────────────────────────────────────────────────────────
class TestArxiv:
    """arXiv client tests — the `arxiv` library and HTTP are fully mocked.

    Fix: MagicMock is already imported at module level, so the redundant
    local `from unittest.mock import MagicMock` lines are removed, as is
    the unused local `import types` in test_search_arxiv_mock (that test
    uses a MagicMock fake module, not types.ModuleType).
    """

    def test_convert_result(self) -> None:
        """Test converting arxiv.Result to Paper via the new library."""
        from datetime import datetime

        mock_result = MagicMock()
        mock_result.entry_id = "http://arxiv.org/abs/2401.00001v1"
        mock_result.title = "A Novel Approach to Protein Folding"
        mock_result.summary = "We study protein folding."
        mock_result.published = datetime(2024, 1, 15)
        mock_result.doi = "10.5678/protein"
        mock_result.primary_category = "q-bio.BM"
        mock_author1 = MagicMock()
        mock_author1.name = "Alice Researcher"
        mock_author2 = MagicMock()
        mock_author2.name = "Bob Scientist"
        mock_result.authors = [mock_author1, mock_author2]
        paper = _convert_result(mock_result)
        assert paper.title == "A Novel Approach to Protein Folding"
        assert paper.arxiv_id == "2401.00001"
        assert paper.year == 2024
        assert len(paper.authors) == 2
        assert paper.authors[0].name == "Alice Researcher"
        assert paper.source == "arxiv"
        assert paper.doi == "10.5678/protein"
        assert paper.venue == "q-bio.BM"

    def test_search_arxiv_mock(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Test search_arxiv with mocked arxiv library."""
        from datetime import datetime

        mock_result = MagicMock()
        mock_result.entry_id = "http://arxiv.org/abs/2401.00001v1"
        mock_result.title = "Test Paper"
        mock_result.summary = "Abstract."
        mock_result.published = datetime(2024, 1, 1)
        mock_result.doi = ""
        mock_result.primary_category = "cs.LG"
        mock_author = MagicMock()
        mock_author.name = "Test Author"
        mock_result.authors = [mock_author]
        mock_client = MagicMock()
        mock_client.results.return_value = iter([mock_result])
        # Mock the module-level `arxiv` so the `if arxiv is None` guard
        # doesn't short-circuit before the mocked _get_client is reached.
        # Use MagicMock so all attributes (Search, SortOrder, etc.) auto-resolve.
        _fake_arxiv = MagicMock()
        monkeypatch.setattr(
            "researchclaw.literature.arxiv_client.arxiv", _fake_arxiv,
        )
        monkeypatch.setattr(
            "researchclaw.literature.arxiv_client._get_client",
            lambda: mock_client,
        )
        from researchclaw.literature.arxiv_client import _reset_circuit_breaker

        _reset_circuit_breaker()
        papers = search_arxiv("test", limit=10)
        assert len(papers) == 1
        assert papers[0].title == "Test Paper"
        assert papers[0].arxiv_id == "2401.00001"

    def test_search_arxiv_error_graceful(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """search_arxiv returns empty list on error, not raise."""
        import types

        # Build a fake arxiv module with real exception classes so
        # `except arxiv.HTTPError` doesn't TypeError.
        _fake_arxiv = types.ModuleType("arxiv")

        class _FakeHTTPError(Exception):
            pass

        class _FakeUnexpectedEmptyPageError(Exception):
            pass

        _fake_arxiv.HTTPError = _FakeHTTPError
        _fake_arxiv.UnexpectedEmptyPageError = _FakeUnexpectedEmptyPageError
        _fake_arxiv.SortCriterion = MagicMock()
        _fake_arxiv.SortOrder = MagicMock()
        _fake_arxiv.Search = MagicMock()
        monkeypatch.setattr(
            "researchclaw.literature.arxiv_client.arxiv", _fake_arxiv,
        )
        mock_client = MagicMock()
        mock_client.results.side_effect = _FakeHTTPError("Simulated arXiv HTTP error")
        monkeypatch.setattr(
            "researchclaw.literature.arxiv_client._get_client",
            lambda: mock_client,
        )
        from researchclaw.literature.arxiv_client import _reset_circuit_breaker

        _reset_circuit_breaker()
        papers = search_arxiv("test", limit=10)
        assert papers == []
# ──────────────────────────────────────────────────────────────────────
# Unified search & deduplication tests
# ──────────────────────────────────────────────────────────────────────
class TestDeduplication:
    """_deduplicate: duplicates collapse by DOI, arXiv id, then title."""

    def test_dedup_by_doi(self) -> None:
        high = _make_paper(paper_id="s2-1", doi="10.1234/a", citation_count=100)
        low = _make_paper(
            paper_id="arxiv-1", doi="10.1234/a", citation_count=50, source="arxiv"
        )
        merged = _deduplicate([high, low])
        assert len(merged) == 1
        assert merged[0].citation_count == 100  # keeps higher

    def test_dedup_by_arxiv_id(self) -> None:
        low = _make_paper(
            paper_id="s2-1", doi="", arxiv_id="2401.00001", citation_count=10
        )
        high = _make_paper(
            paper_id="arxiv-1",
            doi="",
            arxiv_id="2401.00001",
            citation_count=20,
            source="arxiv",
        )
        merged = _deduplicate([low, high])
        assert len(merged) == 1
        assert merged[0].citation_count == 20  # arxiv version had more

    def test_dedup_by_title(self) -> None:
        first = _make_paper(
            paper_id="s2-1",
            doi="",
            arxiv_id="",
            title="My Cool Paper",
            citation_count=5,
        )
        second = _make_paper(
            paper_id="s2-2",
            doi="",
            arxiv_id="",
            title="My Cool Paper",
            citation_count=10,
        )
        merged = _deduplicate([first, second])
        assert len(merged) == 1
        assert merged[0].citation_count == 10

    def test_dedup_no_duplicates(self) -> None:
        # Distinct DOI, arXiv id, and title: nothing should be merged.
        a = _make_paper(paper_id="s2-1", title="Paper A", doi="10.1/a", arxiv_id="1111.11111")
        b = _make_paper(paper_id="s2-2", title="Paper B", doi="10.1/b", arxiv_id="2222.22222")
        assert len(_deduplicate([a, b])) == 2

    def test_normalise_title(self) -> None:
        assert _normalise_title(" The Great Paper!!! ") == "the great paper"
        assert _normalise_title("A/B Testing: Methods") == "ab testing methods"
class TestPapersToBibtex:
    def test_generates_combined(self) -> None:
        """Two papers should produce exactly two BibTeX entries in one string."""
        combined = papers_to_bibtex(
            [
                _make_paper(paper_id="s2-1", title="Paper A"),
                _make_paper(paper_id="s2-2", title="Paper B", venue="ICML 2024"),
            ]
        )
        assert combined.count("@") == 2
        assert "Paper A" in combined
        assert "Paper B" in combined
class TestSearchPapers:
    """search_papers / search_papers_multi_query orchestration.

    Every back-end search function is monkeypatched, so no test here
    touches the network; time.sleep is stubbed to keep the suite fast.
    """

    def test_search_papers_combines_sources(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Mock both S2 and arXiv to verify combined search."""
        s2_paper = _make_paper(
            paper_id="s2-1", source="semantic_scholar", citation_count=100
        )
        arxiv_paper = _make_paper(
            paper_id="arxiv-1",
            title="Different Paper",
            doi="10.2/b",
            arxiv_id="2402.99999",
            source="arxiv",
            citation_count=50,
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_semantic_scholar",
            lambda *a, **kw: [s2_paper],
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_arxiv",
            lambda *a, **kw: [arxiv_paper],
        )
        monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None)
        papers = search_papers("test", sources=["semantic_scholar", "arxiv"])
        assert len(papers) == 2
        # Should be sorted by citation_count desc
        assert papers[0].citation_count >= papers[1].citation_count

    def test_default_sources_openalex_first(self) -> None:
        """OpenAlex should be the primary (first) source — least restrictive limits."""
        from researchclaw.literature.search import _DEFAULT_SOURCES
        assert _DEFAULT_SOURCES[0] == "openalex"
        assert "semantic_scholar" in _DEFAULT_SOURCES
        assert "arxiv" in _DEFAULT_SOURCES

    def test_s2_failure_does_not_block_others(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """When S2 fails, other sources should still return results."""
        arxiv_paper = _make_paper(
            paper_id="arxiv-ok", title="ArXiv Paper", source="arxiv",
            doi="10.1/ax", arxiv_id="2401.99991",
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_openalex",
            lambda *a, **kw: [],
        )
        # Generator-throw trick: a lambda that raises when called.
        monkeypatch.setattr(
            "researchclaw.literature.search.search_semantic_scholar",
            lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("S2 down")),
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_arxiv",
            lambda *a, **kw: [arxiv_paper],
        )
        monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None)
        papers = search_papers("test")
        assert len(papers) >= 1
        assert papers[0].source == "arxiv"

    def test_search_papers_unknown_source(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        # Unknown source names should yield an empty result set, not an error.
        monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None)
        papers = search_papers("test", sources=["unknown_source"])
        assert papers == []

    def test_search_papers_multi_query(self, monkeypatch: pytest.MonkeyPatch) -> None:
        # Each of the three queries yields one unique paper; all must survive dedup.
        call_count = 0

        def mock_search(*a: Any, **kw: Any) -> list[Paper]:
            nonlocal call_count
            call_count += 1
            return [
                _make_paper(
                    paper_id=f"s2-{call_count}",
                    title=f"Unique Paper {call_count}",
                    doi=f"10.{call_count}/unique",
                    arxiv_id=f"240{call_count}.{call_count:05d}",
                )
            ]

        monkeypatch.setattr(
            "researchclaw.literature.search.search_papers",
            mock_search,
        )
        monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None)
        papers = search_papers_multi_query(["q1", "q2", "q3"])
        assert call_count == 3
        # All unique titles so no dedup
        assert len(papers) == 3
# ──────────────────────────────────────────────────────────────────────
# Edge cases
# ──────────────────────────────────────────────────────────────────────
class TestEdgeCases:
    def test_paper_with_no_meaningful_title_word(self) -> None:
        """cite_key should still work with stopword-only titles."""
        paper = _make_paper(title="The And For With", year=2024)
        # Every title word is a stopword or too short, so no keyword suffix.
        assert paper.cite_key.startswith("vaswani2024")

    def test_paper_multiple_authors_bibtex(self) -> None:
        # BibTeX joins multiple authors with " and ".
        authors = (
            Author(name="Alice One"),
            Author(name="Bob Two"),
            Author(name="Charlie Three"),
        )
        bib = _make_paper(authors=authors).to_bibtex()
        assert "Alice One and Bob Two and Charlie Three" in bib

    def test_empty_s2_response(self) -> None:
        """_parse_s2_paper shouldn't crash on minimal data."""
        paper = _parse_s2_paper({"paperId": "x"})
        assert paper.paper_id == "s2-x"
        assert paper.title == ""
        assert paper.authors == ()
# ──────────────────────────────────────────────────────────────────────
# arXiv circuit breaker tests
# ──────────────────────────────────────────────────────────────────────
class TestArxivCircuitBreaker:
    """State-machine tests for the arXiv client's circuit breaker.

    These manipulate module-private breaker state (_cb_*) directly;
    setup_method resets the breaker so tests don't leak state.
    """

    def setup_method(self) -> None:
        # Start every test from a freshly reset (CLOSED) breaker.
        from researchclaw.literature.arxiv_client import _reset_circuit_breaker
        _reset_circuit_breaker()

    def test_failure_triggers_circuit_breaker(self) -> None:
        """Three consecutive failures should trip the circuit breaker."""
        from researchclaw.literature import arxiv_client
        # Simulate 3 consecutive failures
        for _ in range(3):
            arxiv_client._cb_on_failure()
        assert arxiv_client._cb_state == arxiv_client._CB_OPEN
        assert arxiv_client._cb_trip_count == 1

    def test_breaker_open_skips_requests(self) -> None:
        """When breaker is OPEN, requests should be skipped."""
        import time as time_mod
        from researchclaw.literature import arxiv_client
        # Force OPEN with a cooldown that cannot expire during the test.
        arxiv_client._cb_state = arxiv_client._CB_OPEN
        arxiv_client._cb_open_since = time_mod.monotonic()
        arxiv_client._cb_cooldown_sec = 999
        assert not arxiv_client._cb_should_allow()

    def test_success_resets_breaker(self) -> None:
        """A successful request should reset the circuit breaker."""
        from researchclaw.literature import arxiv_client
        arxiv_client._cb_state = arxiv_client._CB_HALF_OPEN
        arxiv_client._cb_consecutive_429s = 2
        arxiv_client._cb_on_success()
        # Success returns the breaker to CLOSED and clears the 429 counter.
        assert arxiv_client._cb_state == arxiv_client._CB_CLOSED
        assert arxiv_client._cb_consecutive_429s == 0

    def test_half_open_probe_failure_doubles_cooldown(self) -> None:
        """Probe failure in HALF_OPEN should double the cooldown."""
        from researchclaw.literature import arxiv_client
        arxiv_client._cb_state = arxiv_client._CB_HALF_OPEN
        initial_cooldown = arxiv_client._cb_cooldown_sec
        arxiv_client._cb_on_failure()
        assert arxiv_client._cb_state == arxiv_client._CB_OPEN
        # Cooldown doubles on each probe failure, capped at 600 seconds.
        assert arxiv_client._cb_cooldown_sec == min(initial_cooldown * 2, 600)

    def test_search_with_http_error(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """search_arxiv should return empty list on HTTPError."""
        import types
        # Fake arxiv module with real exception classes so the client's
        # `except arxiv.HTTPError` clause can catch them.
        _fake_arxiv = types.ModuleType("arxiv")
        class _FakeHTTPError(Exception):
            pass
        class _FakeUnexpectedEmptyPageError(Exception):
            pass
        _fake_arxiv.HTTPError = _FakeHTTPError
        _fake_arxiv.UnexpectedEmptyPageError = _FakeUnexpectedEmptyPageError
        _fake_arxiv.SortCriterion = MagicMock()
        _fake_arxiv.SortOrder = MagicMock()
        _fake_arxiv.Search = MagicMock()
        monkeypatch.setattr(
            "researchclaw.literature.arxiv_client.arxiv", _fake_arxiv,
        )
        mock_client = MagicMock()
        mock_client.results.side_effect = _FakeHTTPError("Simulated 429")
        monkeypatch.setattr(
            "researchclaw.literature.arxiv_client._get_client",
            lambda: mock_client,
        )
        from researchclaw.literature.arxiv_client import _reset_circuit_breaker
        _reset_circuit_breaker()
        papers = search_arxiv("test", limit=5)
        assert papers == []
# ──────────────────────────────────────────────────────────────────────
# OpenAlex client tests
# ──────────────────────────────────────────────────────────────────────
# Canonical OpenAlex /works search payload — includes an inverted-index
# abstract — consumed by the TestOpenAlex parsing tests below.
SAMPLE_OPENALEX_RESPONSE = {
    "results": [
        {
            "id": "https://openalex.org/W123456",
            "title": "Attention Is All You Need",
            "authorships": [
                {
                    "author": {"display_name": "Ashish Vaswani"},
                    "institutions": [{"display_name": "Google Brain"}],
                }
            ],
            "publication_year": 2017,
            "primary_location": {
                "source": {"display_name": "NeurIPS"}
            },
            "cited_by_count": 85000,
            # DOI and arXiv ids arrive as full URLs; the parser must strip prefixes.
            "doi": "https://doi.org/10.5555/3295222.3295349",
            "ids": {
                "openalex": "https://openalex.org/W123456",
                "arxiv": "https://arxiv.org/abs/1706.03762",
            },
            "abstract_inverted_index": {
                "The": [0],
                "dominant": [1],
                "models": [2, 6],
                "are": [3],
                "based": [4],
                "on": [5],
            },
        }
    ]
}
class TestOpenAlex:
    """OpenAlex client tests (HTTP layer mocked via monkeypatch).

    Fix: test_openalex_network_error previously relied on a stray
    `import urllib.error` placed at the bottom of the module; it now
    imports urllib.error locally so the test is self-contained.
    """

    def test_parse_openalex_response(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Mock urllib to return sample OpenAlex response."""
        from researchclaw.literature.openalex_client import search_openalex

        response_bytes = json.dumps(SAMPLE_OPENALEX_RESPONSE).encode("utf-8")
        mock_resp = MagicMock()
        mock_resp.read.return_value = response_bytes
        mock_resp.__enter__ = lambda s: s
        mock_resp.__exit__ = MagicMock(return_value=False)
        monkeypatch.setattr(
            "researchclaw.literature.openalex_client.urllib.request.urlopen",
            lambda *a, **kw: mock_resp,
        )
        papers = search_openalex("attention", limit=5)
        assert len(papers) == 1
        p = papers[0]
        assert p.title == "Attention Is All You Need"
        assert p.year == 2017
        assert p.citation_count == 85000
        # URL prefixes must have been stripped from DOI / arXiv ids.
        assert p.doi == "10.5555/3295222.3295349"
        assert p.arxiv_id == "1706.03762"
        assert p.source == "openalex"
        assert p.authors[0].name == "Ashish Vaswani"

    def test_abstract_reconstruction(self) -> None:
        # Words are re-ordered by their inverted-index positions.
        from researchclaw.literature.openalex_client import _reconstruct_abstract

        inv_idx = {"Hello": [0], "world": [1], "foo": [3], "bar": [2]}
        assert _reconstruct_abstract(inv_idx) == "Hello world bar foo"

    def test_abstract_empty(self) -> None:
        # None and an empty index both yield an empty abstract.
        from researchclaw.literature.openalex_client import _reconstruct_abstract

        assert _reconstruct_abstract(None) == ""
        assert _reconstruct_abstract({}) == ""

    def test_openalex_network_error(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Should return empty list on network error."""
        # Local import: do not depend on the stray module-bottom
        # `import urllib.error` for this test to run.
        import urllib.error

        from researchclaw.literature.openalex_client import search_openalex

        monkeypatch.setattr(
            "researchclaw.literature.openalex_client.urllib.request.urlopen",
            lambda *a, **kw: (_ for _ in ()).throw(urllib.error.URLError("timeout")),
        )
        monkeypatch.setattr(
            "researchclaw.literature.openalex_client.time.sleep", lambda _: None,
        )
        assert search_openalex("test", limit=5) == []
# ──────────────────────────────────────────────────────────────────────
# Multi-source fallback tests
# ──────────────────────────────────────────────────────────────────────
class TestMultiSourceFallback:
    def test_openalex_failure_falls_back_to_s2_and_arxiv(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """When OpenAlex fails, S2 and arXiv should still return results."""

        def _openalex_down(*a: Any, **kw: Any) -> list[Paper]:
            raise RuntimeError("OpenAlex down")

        arxiv_paper = _make_paper(
            paper_id="arxiv-ok", title="ArXiv Paper", source="arxiv",
            doi="10.1/ax", arxiv_id="2401.99999",
        )
        s2_paper = _make_paper(
            paper_id="s2-ok", title="S2 Paper", source="semantic_scholar",
            doi="10.1/s2", arxiv_id="2402.99999",
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_openalex", _openalex_down
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_semantic_scholar",
            lambda *a, **kw: [s2_paper],
        )
        monkeypatch.setattr(
            "researchclaw.literature.search.search_arxiv",
            lambda *a, **kw: [arxiv_paper],
        )
        monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None)
        papers = search_papers("test")
        assert len(papers) >= 1
        # At least one of the surviving sources must have contributed.
        assert {p.source for p in papers} & {"semantic_scholar", "arxiv"}
# ──────────────────────────────────────────────────────────────────────
# Cache TTL tests
# ──────────────────────────────────────────────────────────────────────
class TestCacheTTL:
    """Literature cache: per-source TTLs and basic round-tripping."""

    def test_source_specific_ttl(self, tmp_path: Any) -> None:
        """arXiv cache should expire after 24h, not 7 days."""
        from researchclaw.literature.cache import _SOURCE_TTL, get_cached, put_cache

        assert _SOURCE_TTL["arxiv"] == 86400  # 24h
        assert _SOURCE_TTL["semantic_scholar"] == 86400 * 3
        # A freshly written entry must be readable straight back.
        entry = [{"paper_id": "x", "title": "Y"}]
        put_cache("test", "arxiv", 10, entry, cache_base=tmp_path)
        cached = get_cached("test", "arxiv", 10, cache_base=tmp_path)
        assert cached is not None
        assert len(cached) == 1

    def test_citation_verify_ttl_is_permanent(self) -> None:
        # Verified citations are effectively cached forever (>= 1 year).
        from researchclaw.literature.cache import _SOURCE_TTL

        assert _SOURCE_TTL["citation_verify"] >= 86400 * 365
import urllib.error
================================================
FILE: tests/test_rc_llm.py
================================================
from __future__ import annotations
import json
import urllib.request
from types import SimpleNamespace
from typing import Any, Mapping
import pytest
from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse, _NEW_PARAM_MODELS
class _DummyHTTPResponse:
def __init__(self, payload: Mapping[str, Any]):
self._payload = payload
def read(self) -> bytes:
return json.dumps(self._payload).encode("utf-8")
def __enter__(self) -> _DummyHTTPResponse:
return self
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
return None
def _make_client(
    *,
    api_key: str = "test-key",
    primary_model: str = "gpt-5.2",
    fallback_models: list[str] | None = None,
    timeout_sec: int = 120,
) -> LLMClient:
    """Construct an LLMClient pointed at a dummy endpoint for unit tests."""
    if fallback_models is None:
        fallback_models = ["gpt-5.1", "gpt-4.1", "gpt-4o"]
    return LLMClient(
        LLMConfig(
            base_url="https://api.example.com/v1",
            api_key=api_key,
            primary_model=primary_model,
            fallback_models=fallback_models,
            timeout_sec=timeout_sec,
        )
    )
def _capture_raw_call(
    monkeypatch: pytest.MonkeyPatch, *, model: str, response_data: Mapping[str, Any]
) -> tuple[dict[str, object], LLMResponse, dict[str, object]]:
    """Drive LLMClient._raw_call once with urlopen mocked out.

    Returns (body, resp, captured): the JSON payload the client posted,
    the LLMResponse parsed from ``response_data``, and a dict holding the
    raw Request object plus the timeout passed to urlopen.
    """
    captured: dict[str, object] = {}

    def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
        # Record what the client sent so callers can assert on it.
        captured["request"] = req
        captured["timeout"] = timeout
        return _DummyHTTPResponse(response_data)

    monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
    client = _make_client()
    # Positional args: model, messages, max_tokens, temperature, json_mode.
    resp = client._raw_call(
        model, [{"role": "user", "content": "hello"}], 123, 0.2, False
    )
    request = captured["request"]
    assert isinstance(request, urllib.request.Request)
    data = request.data
    assert isinstance(data, bytes)
    body = json.loads(data.decode("utf-8"))
    assert isinstance(body, dict)
    return body, resp, captured
def test_llm_config_defaults():
    """Unspecified LLMConfig fields fall back to their documented defaults."""
    config = LLMConfig(base_url="https://api.example.com/v1", api_key="k")
    assert config.primary_model == "gpt-4o"
    assert config.max_tokens == 4096
    assert config.temperature == 0.7


def test_llm_config_custom_values():
    """Explicitly supplied LLMConfig fields are stored verbatim."""
    config = LLMConfig(
        base_url="https://custom.example/v1",
        api_key="custom",
        primary_model="o3",
        fallback_models=["o3-mini"],
        max_tokens=2048,
        temperature=0.1,
        timeout_sec=30,
    )
    expected = {
        "primary_model": "o3",
        "fallback_models": ["o3-mini"],
        "max_tokens": 2048,
        "temperature": 0.1,
        "timeout_sec": 30,
    }
    for field, value in expected.items():
        assert getattr(config, field) == value


def test_llm_response_dataclass_fields():
    """Constructor arguments land on the matching LLMResponse attributes."""
    response = LLMResponse(content="ok", model="gpt-5.2", completion_tokens=10)
    assert response.content == "ok"
    assert response.model == "gpt-5.2"
    assert response.completion_tokens == 10


def test_llm_response_defaults():
    """Token counters default to 0, strings to empty, raw to {}."""
    response = LLMResponse(content="ok", model="gpt-5.2")
    assert response.prompt_tokens == 0
    assert response.completion_tokens == 0
    assert response.total_tokens == 0
    assert response.finish_reason == ""
    assert response.truncated is False
    assert response.raw == {}
def test_llm_client_initialization_stores_config():
    """The client keeps a reference to the exact config object it was given."""
    config = LLMConfig(base_url="https://api.example.com/v1", api_key="k")
    assert LLMClient(config).config is config


def test_llm_client_model_chain_is_primary_plus_fallbacks():
    """_model_chain is the primary model followed by the fallbacks, in order."""
    client = _make_client(
        primary_model="gpt-5.4", fallback_models=["gpt-4.1", "gpt-4o"]
    )
    assert client._model_chain == ["gpt-5.4", "gpt-4.1", "gpt-4o"]


def test_needs_max_completion_tokens_for_new_models():
    # gpt-5.x must match one of the new-parameter model prefixes.
    assert any("gpt-5.2".startswith(prefix) for prefix in _NEW_PARAM_MODELS)


def test_needs_max_completion_tokens_false_for_old_models():
    # Legacy models keep the classic max_tokens parameter.
    assert not any("gpt-4o".startswith(prefix) for prefix in _NEW_PARAM_MODELS)
def _stop_response() -> dict[str, Any]:
    """A minimal chat-completion payload whose finish_reason is 'stop'."""
    return {"choices": [{"message": {"content": "x"}, "finish_reason": "stop"}]}


def test_build_request_body_structure_via_raw_call(monkeypatch: pytest.MonkeyPatch):
    """The POSTed body carries model, messages, and temperature verbatim."""
    body, _, _ = _capture_raw_call(
        monkeypatch, model="gpt-4o", response_data=_stop_response()
    )
    assert body["model"] == "gpt-4o"
    assert body["messages"] == [{"role": "user", "content": "hello"}]
    assert body["temperature"] == 0.2


def test_build_request_uses_max_completion_tokens_for_new_models(
    monkeypatch: pytest.MonkeyPatch,
):
    """New-style models use max_completion_tokens instead of max_tokens."""
    body, _, _ = _capture_raw_call(
        monkeypatch, model="gpt-5.2", response_data=_stop_response()
    )
    # Reasoning models enforce a minimum of 32768 tokens
    assert body["max_completion_tokens"] == 32768
    assert "max_tokens" not in body


def test_build_request_uses_max_tokens_for_old_models(monkeypatch: pytest.MonkeyPatch):
    """Legacy models keep the classic max_tokens parameter untouched."""
    body, _, _ = _capture_raw_call(
        monkeypatch, model="gpt-4.1", response_data=_stop_response()
    )
    assert body["max_tokens"] == 123
    assert "max_completion_tokens" not in body
def test_parse_response_with_valid_payload_via_raw_call(
    monkeypatch: pytest.MonkeyPatch,
):
    """A complete payload maps cleanly onto LLMResponse fields."""
    payload = {
        "model": "gpt-5.2",
        "choices": [{"message": {"content": "hello"}, "finish_reason": "stop"}],
        "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
    }
    _, parsed, _ = _capture_raw_call(
        monkeypatch, model="gpt-5.2", response_data=payload
    )
    assert parsed.content == "hello"
    assert parsed.model == "gpt-5.2"
    assert parsed.prompt_tokens == 1
    assert parsed.total_tokens == 3


def test_parse_response_truncated_when_finish_reason_length(
    monkeypatch: pytest.MonkeyPatch,
):
    """finish_reason 'length' must flag the response as truncated."""
    payload = {
        "choices": [{"message": {"content": "partial"}, "finish_reason": "length"}],
        "usage": {},
    }
    _, parsed, _ = _capture_raw_call(
        monkeypatch, model="gpt-5.2", response_data=payload
    )
    assert parsed.finish_reason == "length"
    assert parsed.truncated is True


def test_parse_response_missing_optional_fields_graceful(
    monkeypatch: pytest.MonkeyPatch,
):
    """Null content and missing usage fields degrade to empty/zero values."""
    payload = {"choices": [{"message": {"content": None}}]}
    _, parsed, _ = _capture_raw_call(
        monkeypatch, model="gpt-5.2", response_data=payload
    )
    assert parsed.content == ""
    for counter in (parsed.prompt_tokens, parsed.completion_tokens, parsed.total_tokens):
        assert counter == 0
    assert parsed.finish_reason == ""
def test_from_rc_config_builds_expected_llm_config():
    """from_rc_config copies base_url/key/models straight from rc_config.llm."""
    rc_config = SimpleNamespace(
        llm=SimpleNamespace(
            base_url="https://proxy.example/v1",
            api_key="inline-key",
            api_key_env="OPENAI_API_KEY",
            primary_model="o3",
            fallback_models=("o3-mini", "gpt-4o"),
        )
    )
    cfg = LLMClient.from_rc_config(rc_config).config
    assert cfg.base_url == "https://proxy.example/v1"
    assert cfg.api_key == "inline-key"
    assert cfg.primary_model == "o3"
    # The fallback tuple is normalised into a list.
    assert cfg.fallback_models == ["o3-mini", "gpt-4o"]


def test_from_rc_config_reads_api_key_from_env_when_missing(
    monkeypatch: pytest.MonkeyPatch,
):
    """An empty inline api_key falls back to the env var named by api_key_env."""
    monkeypatch.setenv("RC_TEST_API_KEY", "env-key")
    rc_config = SimpleNamespace(
        llm=SimpleNamespace(
            base_url="https://proxy.example/v1",
            api_key="",
            api_key_env="RC_TEST_API_KEY",
            primary_model="gpt-5.2",
            fallback_models=(),
        )
    )
    assert LLMClient.from_rc_config(rc_config).config.api_key == "env-key"


def test_new_param_models_contains_expected_models():
    """Every known new-parameter model family must be registered."""
    expected = {"gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4", "o3", "o3-mini", "o4-mini"}
    assert expected.issubset(_NEW_PARAM_MODELS)
def test_raw_call_adds_json_mode_response_format(monkeypatch: pytest.MonkeyPatch):
    """json_mode=True must add response_format={'type': 'json_object'}."""
    captured: dict[str, object] = {}

    def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
        captured["request"] = req
        return _DummyHTTPResponse({"choices": [{"message": {"content": "{}"}}]})

    monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
    _ = _make_client()._raw_call(
        "gpt-5.2", [{"role": "user", "content": "json"}], 50, 0.1, True
    )
    request = captured["request"]
    assert isinstance(request, urllib.request.Request)
    raw_bytes = request.data
    assert isinstance(raw_bytes, bytes)
    body = json.loads(raw_bytes.decode("utf-8"))
    assert isinstance(body, dict)
    assert body["response_format"] == {"type": "json_object"}


def test_raw_call_sets_auth_and_user_agent_headers(monkeypatch: pytest.MonkeyPatch):
    """_raw_call sends Bearer auth, a User-Agent, and the configured timeout."""
    captured: dict[str, object] = {}

    def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse:
        captured["request"] = req
        captured["timeout"] = timeout
        return _DummyHTTPResponse({"choices": [{"message": {"content": "ok"}}]})

    monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)
    client = _make_client(api_key="secret", timeout_sec=77)
    _ = client._raw_call("gpt-5.2", [{"role": "user", "content": "hi"}], 20, 0.6, False)
    request = captured["request"]
    assert isinstance(request, urllib.request.Request)
    headers = {name.lower(): value for name, value in request.headers.items()}
    assert headers["authorization"] == "Bearer secret"
    assert "user-agent" in headers
    assert captured["timeout"] == 77
def test_chat_prepends_system_message(monkeypatch: pytest.MonkeyPatch):
captured: dict[str, list[dict[str, str]]] = {}
def fake_raw_call(
self: LLMClient,
model: str,
messages: list[dict[str, str]],
max_tokens: int,
temperature: float,
json_mode: bool,
) -> LLMResponse:
captured["messages"] = messages
return LLMResponse(content="ok", model=model)
monkeypatch.setattr(LLMClient, "_raw_call", fake_raw_call)
client = _make_client(primary_model="gpt-5.2", fallback_models=["gpt-4o"])
client.chat([{"role": "user", "content": "q"}], system="sys")
assert captured["messages"][0] == {"role": "system", "content": "sys"}
def test_chat_uses_fallback_after_first_model_error(monkeypatch: pytest.MonkeyPatch):
    """When the primary model raises, chat() must fall back to the next model."""
    attempted_models: list[str] = []

    def flaky_call_with_retry(
        self: LLMClient,
        model: str,
        messages: list[dict[str, str]],
        max_tokens: int,
        temperature: float,
        json_mode: bool,
    ) -> LLMResponse:
        _ = (self, messages, max_tokens, temperature, json_mode)
        attempted_models.append(model)
        # Simulate a hard failure for the primary model only.
        if model == "gpt-5.2":
            raise RuntimeError("first failed")
        return LLMResponse(content="ok", model=model)

    monkeypatch.setattr(LLMClient, "_call_with_retry", flaky_call_with_retry)
    client = _make_client(primary_model="gpt-5.2", fallback_models=["gpt-5.1"])
    response = client.chat([{"role": "user", "content": "x"}])
    assert attempted_models == ["gpt-5.2", "gpt-5.1"]
    assert response.model == "gpt-5.1"
================================================
FILE: tests/test_rc_novelty.py
================================================
"""Tests for researchclaw.literature.novelty — novelty detection module."""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.literature.novelty import (
_assess_novelty,
_build_novelty_queries,
_compute_similarity,
_extract_keywords,
_jaccard_keywords,
check_novelty,
)
# ---------------------------------------------------------------------------
# _extract_keywords
# ---------------------------------------------------------------------------
class TestExtractKeywords:
    """Unit tests for _extract_keywords: lowercasing, stop-word and short-token
    filtering, de-duplication, and order preservation."""

    def test_basic_extraction(self) -> None:
        # Keywords come back lower-cased.
        kws = _extract_keywords("Transformer attention mechanisms for NLP")
        assert "transformer" in kws
        assert "attention" in kws
        assert "mechanisms" in kws
        assert "nlp" in kws

    def test_stop_words_removed(self) -> None:
        kws = _extract_keywords("the model is a new approach for data")
        # "the", "is", "a", "new", "approach", "for", "data", "model" are stop words
        assert "the" not in kws
        assert "is" not in kws

    def test_short_tokens_removed(self) -> None:
        kws = _extract_keywords("AI ML RL deep reinforcement learning")
        # "AI", "ML", "RL" are only 2 chars → removed
        assert "ai" not in kws
        assert "deep" in kws
        assert "reinforcement" in kws

    def test_deduplication(self) -> None:
        # Repeated words appear only once in the output.
        kws = _extract_keywords("attention attention attention mechanism")
        assert kws.count("attention") == 1

    def test_empty_input(self) -> None:
        assert _extract_keywords("") == []

    def test_preserves_order(self) -> None:
        # First-occurrence order of the input is kept.
        kws = _extract_keywords("alpha beta gamma delta")
        assert kws == ["alpha", "beta", "gamma", "delta"]
# ---------------------------------------------------------------------------
# _jaccard_keywords
# ---------------------------------------------------------------------------
class TestJaccardKeywords:
    """Unit tests for _jaccard_keywords: |A∩B| / |A∪B| with 0.0 for empty inputs."""

    def test_identical_sets(self) -> None:
        assert _jaccard_keywords(["a", "b", "c"], ["a", "b", "c"]) == 1.0

    def test_disjoint_sets(self) -> None:
        assert _jaccard_keywords(["a", "b"], ["c", "d"]) == 0.0

    def test_partial_overlap(self) -> None:
        # {a, b, c} & {b, c, d} = {b, c} / {a, b, c, d} = 2/4 = 0.5
        assert _jaccard_keywords(["a", "b", "c"], ["b", "c", "d"]) == 0.5

    def test_empty_first(self) -> None:
        assert _jaccard_keywords([], ["a", "b"]) == 0.0

    def test_empty_second(self) -> None:
        assert _jaccard_keywords(["a", "b"], []) == 0.0

    def test_both_empty(self) -> None:
        # Degenerate case: defined as 0.0 rather than raising on division by zero.
        assert _jaccard_keywords([], []) == 0.0
# ---------------------------------------------------------------------------
# _compute_similarity
# ---------------------------------------------------------------------------
class TestComputeSimilarity:
    """Unit tests for _compute_similarity(keywords, title, abstract) -> float in [0, 1]."""

    def test_returns_float_0_to_1(self) -> None:
        sim = _compute_similarity(
            ["transformer", "attention"],
            "Transformer Attention in NLP",
            "We study attention mechanisms in transformer models.",
        )
        assert 0.0 <= sim <= 1.0

    def test_high_similarity_for_matching_content(self) -> None:
        kws = ["transformer", "attention", "mechanisms", "self-attention"]
        sim = _compute_similarity(
            kws,
            "Self-Attention Mechanisms in Transformers",
            "This paper studies transformer self-attention mechanisms in detail.",
        )
        assert sim > 0.1  # should have meaningful overlap

    def test_low_similarity_for_unrelated_content(self) -> None:
        # Keywords completely unrelated to the paper text should score near zero.
        kws = ["quantum", "computing", "entanglement", "qubit"]
        sim = _compute_similarity(
            kws,
            "Deep Learning for Image Classification",
            "We propose a convolutional neural network for classifying images.",
        )
        assert sim < 0.1

    def test_empty_keywords(self) -> None:
        sim = _compute_similarity([], "Some title", "Some abstract")
        assert sim == 0.0
# ---------------------------------------------------------------------------
# _build_novelty_queries
# ---------------------------------------------------------------------------
class TestBuildNoveltyQueries:
    """Unit tests for _build_novelty_queries: the topic is always query[0]; '## Hn:'
    hypothesis titles are appended (skipping very short ones) up to 5 queries total."""

    def test_includes_topic(self) -> None:
        queries = _build_novelty_queries("Reinforcement Learning", "No hypotheses")
        assert queries[0] == "Reinforcement Learning"

    def test_extracts_hypothesis_titles(self) -> None:
        hyp_text = (
            "## H1: Adaptive learning rates improve convergence\n"
            "Details about H1...\n\n"
            "## H2: Curriculum learning reduces sample complexity\n"
            "Details about H2...\n"
        )
        queries = _build_novelty_queries("RL topic", hyp_text)
        assert len(queries) >= 3  # topic + H1 + H2

    def test_caps_at_5(self) -> None:
        # Nine hypotheses provided, but the query list is capped.
        hyp_text = "\n".join(
            f"## H{i}: Hypothesis number {i} with enough text to pass length filter"
            for i in range(1, 10)
        )
        queries = _build_novelty_queries("Topic", hyp_text)
        assert len(queries) <= 5

    def test_skips_short_titles(self) -> None:
        hyp_text = "## H1: Short\n## H2: This is a longer hypothesis title\n"
        queries = _build_novelty_queries("Topic", hyp_text)
        # "Short" is < 10 chars → excluded
        assert not any("Short" in q for q in queries)

    def test_empty_hypotheses(self) -> None:
        # With no hypotheses the topic alone still yields one query.
        queries = _build_novelty_queries("Topic", "")
        assert len(queries) >= 1
        assert queries[0] == "Topic"
# ---------------------------------------------------------------------------
# _assess_novelty
# ---------------------------------------------------------------------------
class TestAssessNovelty:
    """Unit tests for _assess_novelty(papers, threshold) -> (score, assessment).

    Score is in [0, 1]; assessment is one of "high"/"moderate"/"low"/"critical".
    Higher similarity and higher citation counts push the score down.
    """

    def test_no_similar_papers_is_high(self) -> None:
        # Empty overlap list → maximum novelty.
        score, assessment = _assess_novelty([], 0.25)
        assert score == 1.0
        assert assessment == "high"

    def test_moderate_similarity(self) -> None:
        papers = [{"similarity": 0.35, "citation_count": 10}]
        score, assessment = _assess_novelty(papers, 0.25)
        assert 0.45 <= score <= 0.85
        assert assessment in ("high", "moderate")

    def test_high_similarity_low_novelty(self) -> None:
        papers = [{"similarity": 0.8, "citation_count": 200}]
        score, assessment = _assess_novelty(papers, 0.25)
        assert score <= 0.3
        assert assessment in ("low", "critical")

    def test_multiple_high_impact_overlaps_penalize(self) -> None:
        papers = [
            {"similarity": 0.5, "citation_count": 100},
            {"similarity": 0.45, "citation_count": 80},
            {"similarity": 0.42, "citation_count": 60},
        ]
        score, _ = _assess_novelty(papers, 0.25)
        # Should be penalized for multiple high-citation overlaps
        assert score < 0.6

    def test_score_bounded_0_to_1(self) -> None:
        # Even extreme inputs must not push the score out of range.
        papers = [{"similarity": 0.99, "citation_count": 5000}]
        score, _ = _assess_novelty(papers, 0.25)
        assert 0.0 <= score <= 1.0

    def test_critical_assessment(self) -> None:
        papers = [
            {"similarity": 0.9, "citation_count": 200},
            {"similarity": 0.85, "citation_count": 150},
        ]
        score, assessment = _assess_novelty(papers, 0.25)
        assert assessment == "critical"
        assert score < 0.25
# ---------------------------------------------------------------------------
# check_novelty (integration)
# ---------------------------------------------------------------------------
class TestCheckNovelty:
    """Integration tests for check_novelty — mocks the real API calls."""

    @staticmethod
    def _paper_mock(
        *,
        title: str,
        abstract: str,
        paper_id: str,
        citation_count: int = 10,
        year: int = 2024,
        venue: str = "Conf",
        url: str = "https://example.com",
        cite_key: str = "key",
    ) -> MagicMock:
        """Build a MagicMock carrying the paper attributes check_novelty reads.

        Centralises the attribute surface (title/abstract/paper_id/year/venue/
        citation_count/url/cite_key) that was previously duplicated across tests.
        """
        p = MagicMock()
        p.title = title
        p.abstract = abstract
        p.paper_id = paper_id
        p.year = year
        p.venue = venue
        p.citation_count = citation_count
        p.url = url
        p.cite_key = cite_key
        return p

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_basic_flow(self, mock_search: MagicMock) -> None:
        """Smoke test: no similar papers found → high novelty."""
        mock_search.return_value = []
        result = check_novelty(
            topic="Novel quantum-inspired optimization",
            hypotheses_text="## H1: Quantum tunneling improves escape from local minima\n",
        )
        assert isinstance(result, dict)
        assert result["novelty_score"] == 1.0
        assert result["assessment"] in ("high", "insufficient_data")
        assert result["recommendation"] in ("proceed", "proceed_with_caution")
        assert result["topic"] == "Novel quantum-inspired optimization"
        assert "generated" in result

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_with_similar_papers(self, mock_search: MagicMock) -> None:
        """Papers with keyword overlap → lower novelty."""
        # Create a mock paper with overlapping keywords.
        mock_paper = self._paper_mock(
            title="Quantum-Inspired Optimization for Combinatorial Problems",
            abstract=(
                "We propose quantum-inspired optimization methods "
                "using tunneling and superposition analogies to escape local minima."
            ),
            paper_id="abc123",
            citation_count=45,
            venue="NeurIPS",
            url="https://example.com/paper",
            cite_key="abc2024quantum",
        )
        mock_search.return_value = [mock_paper]
        result = check_novelty(
            topic="Quantum-inspired optimization",
            hypotheses_text="## H1: Quantum tunneling improves escape from local minima\n",
        )
        assert result["similar_papers_found"] >= 0
        assert 0.0 <= result["novelty_score"] <= 1.0

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_with_pipeline_papers(self, mock_search: MagicMock) -> None:
        """Papers from candidates.jsonl also checked for overlap."""
        mock_search.return_value = []
        pipeline_papers = [
            {
                "title": "Adaptive Learning Rate Schedules via Meta-Learning",
                "abstract": "We study adaptive learning rate schedules.",
                "paper_id": "p1",
                "year": 2023,
                "venue": "ICML",
                "citation_count": 30,
                "url": "https://example.com",
                "cite_key": "p12023",
            },
        ]
        result = check_novelty(
            topic="Adaptive learning rate schedules",
            hypotheses_text="## H1: Meta-learning adaptive learning rate schedules\n",
            papers_already_seen=pipeline_papers,
        )
        assert isinstance(result, dict)
        assert "similar_papers" in result

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_search_failure_graceful(self, mock_search: MagicMock) -> None:
        """API failure should not crash — falls back to pipeline papers."""
        mock_search.side_effect = RuntimeError("API down")
        result = check_novelty(
            topic="Some topic",
            hypotheses_text="## H1: Some hypothesis with enough text\n",
        )
        assert isinstance(result, dict)
        assert "novelty_score" in result

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_output_keys_complete(self, mock_search: MagicMock) -> None:
        """All expected keys present in output."""
        mock_search.return_value = []
        result = check_novelty(
            topic="Test topic",
            hypotheses_text="Some hypotheses text",
        )
        expected_keys = {
            "topic",
            "hypotheses_checked",
            "search_queries",
            "similar_papers_found",
            "novelty_score",
            "assessment",
            "similar_papers",
            "recommendation",
            "similarity_threshold",
            "search_coverage",
            "total_papers_retrieved",
            "generated",
        }
        assert expected_keys == set(result.keys())

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_recommendation_values(self, mock_search: MagicMock) -> None:
        """Recommendation must be proceed/proceed_with_caution/differentiate/abort."""
        mock_search.return_value = []
        result = check_novelty(
            topic="Test",
            hypotheses_text="## H1: Hypothesis one\n",
        )
        assert result["recommendation"] in ("proceed", "differentiate", "abort", "proceed_with_caution")

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_json_serializable(self, mock_search: MagicMock) -> None:
        """Output must be JSON-serializable for writing to novelty_report.json."""
        mock_search.return_value = []
        result = check_novelty(
            topic="JSON test",
            hypotheses_text="## H1: Test hypothesis title is long enough\n",
        )
        serialized = json.dumps(result)
        assert isinstance(serialized, str)

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_similar_papers_capped_at_20(self, mock_search: MagicMock) -> None:
        """Output similar_papers list capped at 20."""
        # Create many mock papers, all matching the topic keywords.
        papers = [
            self._paper_mock(
                title=f"Paper about optimization variant {i}",
                abstract="Optimization variant study",
                paper_id=f"id_{i}",
                url=f"https://example.com/{i}",
                cite_key=f"key{i}",
            )
            for i in range(40)
        ]
        mock_search.return_value = papers
        result = check_novelty(
            topic="optimization",
            hypotheses_text="## H1: Optimization variants improve performance\n",
            similarity_threshold=0.0,  # low threshold → many matches
        )
        assert len(result["similar_papers"]) <= 20
# ---------------------------------------------------------------------------
# Executor integration — _execute_hypothesis_gen with novelty check
# ---------------------------------------------------------------------------
class TestHypothesisGenNoveltyIntegration:
    """Test that _execute_hypothesis_gen integrates novelty check correctly."""

    @staticmethod
    def _prepare_run(tmp_path: Path, synthesis_text: str) -> tuple[Path, Path]:
        """Create run/stage-08 plus a stage-07 synthesis artifact.

        Returns (stage_dir, run_dir). Factors out the directory scaffolding
        previously duplicated in both tests.
        """
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-08"
        stage_dir.mkdir()
        # Create synthesis artifact from prior stage.
        stage_07 = run_dir / "stage-07"
        stage_07.mkdir()
        (stage_07 / "synthesis.md").write_text(synthesis_text)
        return stage_dir, run_dir

    @staticmethod
    def _make_config(tmp_path: Path):
        """Build a minimal docs-first RCConfig with an inline API key."""
        from researchclaw.config import RCConfig

        data = {
            "project": {"name": "novelty-test", "mode": "docs-first"},
            "research": {"topic": "novelty testing"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local"},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline",
            },
        }
        return RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)

    def test_novelty_report_written_when_available(self, tmp_path: Path) -> None:
        """Hypothesis gen should write novelty_report.json when check succeeds."""
        from researchclaw.pipeline.executor import _execute_hypothesis_gen
        from researchclaw.adapters import AdapterBundle

        stage_dir, run_dir = self._prepare_run(
            tmp_path, "## Synthesis\nSome synthesis content."
        )
        config = self._make_config(tmp_path)
        adapters = AdapterBundle()
        with patch(
            "researchclaw.literature.search.search_papers_multi_query"
        ) as mock_search:
            mock_search.return_value = []
            result = _execute_hypothesis_gen(stage_dir, run_dir, config, adapters)
        assert result.stage.name == "HYPOTHESIS_GEN"
        assert result.status.name == "DONE"
        # hypotheses.md always written
        assert (stage_dir / "hypotheses.md").exists()
        # novelty_report.json should be written (API mocked as returning empty)
        assert (stage_dir / "novelty_report.json").exists()
        report = json.loads((stage_dir / "novelty_report.json").read_text())
        assert report["novelty_score"] == 1.0  # no similar papers → max novelty
        assert "novelty_report.json" in result.artifacts

    def test_novelty_failure_does_not_block(self, tmp_path: Path) -> None:
        """If novelty check crashes, hypothesis gen still succeeds."""
        from researchclaw.pipeline.executor import _execute_hypothesis_gen
        from researchclaw.adapters import AdapterBundle

        stage_dir, run_dir = self._prepare_run(tmp_path, "## Synthesis\nContent.")
        config = self._make_config(tmp_path)
        adapters = AdapterBundle()
        with patch(
            "researchclaw.literature.novelty.check_novelty",
            side_effect=RuntimeError("Novelty check exploded"),
        ):
            result = _execute_hypothesis_gen(stage_dir, run_dir, config, adapters)
        assert result.status.name == "DONE"
        assert (stage_dir / "hypotheses.md").exists()
        # novelty_report.json NOT written since check failed
        assert not (stage_dir / "novelty_report.json").exists()
        assert "novelty_report.json" not in result.artifacts
================================================
FILE: tests/test_rc_preflight.py
================================================
from __future__ import annotations
import urllib.error
from email.message import Message
from unittest.mock import patch
from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse
def _make_client(
    *,
    base_url: str = "https://api.example.com/v1",
    api_key: str = "test-key",
    primary_model: str = "gpt-test",
    fallback_models: list[str] | None = None,
    max_retries: int = 1,
) -> LLMClient:
    """Build an LLMClient with test-friendly defaults; all knobs are keyword-only."""
    config = LLMConfig(
        base_url=base_url,
        api_key=api_key,
        primary_model=primary_model,
        # None means "no fallbacks" — LLMConfig gets an empty list.
        fallback_models=fallback_models or [],
        max_retries=max_retries,
    )
    return LLMClient(config)
class TestPreflight:
    """LLMClient.preflight() maps chat() transport errors to (ok, message) diagnostics."""

    def _failing_preflight(self, err: Exception) -> tuple[bool, str]:
        """Run preflight on a fresh client whose chat() raises `err`.

        Factors out the client+patch boilerplate shared by every failure test.
        """
        client = _make_client()
        with patch.object(client, "chat", side_effect=err):
            return client.preflight()

    def test_preflight_success(self):
        client = _make_client()
        mock_resp = LLMResponse(content="pong", model="gpt-test")
        with patch.object(client, "chat", return_value=mock_resp):
            ok, msg = client.preflight()
        assert ok is True
        assert "OK" in msg
        assert "gpt-test" in msg

    def test_preflight_401_invalid_key(self):
        err = urllib.error.HTTPError("url", 401, "Unauthorized", Message(), None)
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "Invalid API key" in msg

    def test_preflight_403_model_forbidden(self):
        err = urllib.error.HTTPError("url", 403, "Forbidden", Message(), None)
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "not allowed" in msg

    def test_preflight_404_bad_endpoint(self):
        err = urllib.error.HTTPError("url", 404, "Not Found", Message(), None)
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "Endpoint not found" in msg

    def test_preflight_429_rate_limited(self):
        err = urllib.error.HTTPError("url", 429, "Too Many Requests", Message(), None)
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "Rate limited" in msg

    def test_preflight_timeout(self):
        # URLError covers DNS/connect/timeout failures below the HTTP layer.
        err = urllib.error.URLError("timeout")
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "Connection failed" in msg

    def test_preflight_all_models_failed(self):
        err = RuntimeError("All models failed. Last error: ...")
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "All models failed" in msg

    def test_preflight_unknown_http_error(self):
        # Unmapped status codes fall through to a generic "HTTP <code>" message.
        err = urllib.error.HTTPError("url", 500, "Server Error", Message(), None)
        ok, msg = self._failing_preflight(err)
        assert ok is False
        assert "HTTP 500" in msg
================================================
FILE: tests/test_rc_prompts.py
================================================
"""Tests for researchclaw.prompts — PromptManager and template rendering."""
from __future__ import annotations
import textwrap
from pathlib import Path
import pytest
import yaml
from researchclaw.prompts import (
PromptManager,
RenderedPrompt,
_render,
)
# ---------------------------------------------------------------------------
# _render() — template variable substitution
# ---------------------------------------------------------------------------
class TestRender:
    """Unit tests for _render: substitutes {identifier} placeholders while leaving
    JSON-schema-like braces and code braces untouched."""

    def test_simple_substitution(self) -> None:
        assert _render("Hello {name}!", {"name": "World"}) == "Hello World!"

    def test_multiple_variables(self) -> None:
        result = _render(
            "Topic: {topic}, Domain: {domain}", {"topic": "RL", "domain": "ML"}
        )
        assert result == "Topic: RL, Domain: ML"

    def test_missing_variable_left_untouched(self) -> None:
        # Unknown placeholders survive verbatim rather than raising KeyError.
        assert _render("Value: {unknown}", {}) == "Value: {unknown}"

    def test_json_schema_not_substituted(self) -> None:
        # {name: value} schema notation must not be treated as a placeholder.
        template = "Return JSON: {candidates:[...]} with >=8 rows."
        assert _render(template, {"candidates": "SHOULD_NOT_APPEAR"}) == template

    def test_json_schema_complex_not_substituted(self) -> None:
        template = "Schema: {score_1_to_10:number, verdict:string}"
        assert _render(template, {}) == template

    def test_curly_braces_in_code_not_substituted(self) -> None:
        template = "def foo(): { return 1; }"
        assert _render(template, {}) == template

    def test_underscore_variable(self) -> None:
        assert _render("{my_var}", {"my_var": "ok"}) == "ok"

    def test_numeric_suffix(self) -> None:
        assert _render("{score_1}", {"score_1": "9"}) == "9"

    def test_empty_template(self) -> None:
        assert _render("", {"x": "y"}) == ""

    def test_no_placeholders(self) -> None:
        assert _render("No variables here", {"x": "y"}) == "No variables here"
# ---------------------------------------------------------------------------
# PromptManager — defaults
# ---------------------------------------------------------------------------
class TestPromptManagerDefaults:
    """Tests for PromptManager's built-in defaults: stage coverage, system/user
    prompts, json_mode flags, max_tokens limits, blocks, and sub-prompts."""

    def test_all_stages_present(self) -> None:
        """20 stages have for_stage() prompts; iterative_refine uses sub_prompts only."""
        pm = PromptManager()
        names = pm.stage_names()
        assert len(names) >= 20
        for required in [
            "topic_init",
            "problem_decompose",
            "search_strategy",
            "literature_collect",
            "literature_screen",
            "knowledge_extract",
            "synthesis",
            "hypothesis_gen",
            "experiment_design",
            "code_generation",
            "resource_planning",
            "result_analysis",
            "research_decision",
            "paper_outline",
            "paper_draft",
            "peer_review",
            "paper_revision",
            "quality_gate",
            "knowledge_archive",
            "export_publish",
        ]:
            assert pm.has_stage(required), f"Missing stage: {required}"

    def test_system_prompt_nonempty(self) -> None:
        """Every registered stage must ship a non-empty system prompt."""
        pm = PromptManager()
        for name in pm.stage_names():
            assert pm.system(name), f"Empty system prompt for {name}"

    def test_for_stage_returns_rendered_prompt(self) -> None:
        """for_stage() renders variables into the user prompt and returns RenderedPrompt."""
        pm = PromptManager()
        sp = pm.for_stage(
            "topic_init",
            topic="RL",
            domains="ml",
            project_name="test",
            quality_threshold="4.0",
        )
        assert isinstance(sp, RenderedPrompt)
        assert "RL" in sp.user
        assert "ml" in sp.user
        assert sp.system

    def test_json_mode_stages(self) -> None:
        """Stages that emit structured output default to json_mode=True."""
        pm = PromptManager()
        json_stages = [
            "search_strategy",
            "literature_collect",
            "literature_screen",
            "knowledge_extract",
            "resource_planning",
            "quality_gate",
        ]
        for stage in json_stages:
            assert pm.json_mode(stage), f"{stage} should have json_mode=True"

    def test_non_json_stages(self) -> None:
        pm = PromptManager()
        assert not pm.json_mode("topic_init")
        assert not pm.json_mode("synthesis")

    def test_max_tokens(self) -> None:
        """Long-output stages carry explicit limits; others default to None."""
        pm = PromptManager()
        assert pm.max_tokens("code_generation") == 8192
        assert pm.max_tokens("paper_draft") == 16384
        assert pm.max_tokens("topic_init") is None

    def test_block_topic_constraint(self) -> None:
        pm = PromptManager()
        block = pm.block("topic_constraint", topic="Neural Architecture Search")
        assert "Neural Architecture Search" in block
        assert "HARD TOPIC CONSTRAINT" in block

    def test_block_pkg_hint(self) -> None:
        pm = PromptManager()
        block = pm.block("pkg_hint_sandbox")
        assert "numpy" in block
        assert "torch" in block  # mentioned as prohibited

    def test_sub_prompt_code_repair(self) -> None:
        """sub_prompt('code_repair') interpolates filename and issue text."""
        pm = PromptManager()
        rp = pm.sub_prompt(
            "code_repair",
            fname="model.py",
            issues_text="SyntaxError",
            all_files_ctx="...",
        )
        assert "model.py" in rp.user
        assert "SyntaxError" in rp.user
        assert rp.system

    def test_sub_prompt_iterative_improve(self) -> None:
        pm = PromptManager()
        ip = pm.sub_prompt(
            "iterative_improve",
            metric_key="val_loss",
            metric_direction="minimize",
            files_context="...",
            run_summaries="...",
        )
        assert "val_loss" in ip.user
        assert "minimize" in ip.user

    def test_sub_prompt_iterative_repair(self) -> None:
        pm = PromptManager()
        irp = pm.sub_prompt(
            "iterative_repair", issue_text="import error", all_files_ctx="..."
        )
        assert "import error" in irp.user
# ---------------------------------------------------------------------------
# PromptManager — YAML override
# ---------------------------------------------------------------------------
class TestPromptManagerOverrides:
    """Tests for loading a user YAML override file: per-field overrides merge onto
    defaults, and malformed/missing/unknown entries degrade gracefully."""

    def test_override_system_prompt(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
            stages:
              topic_init:
                system: "You are a custom planner."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        assert pm.system("topic_init") == "You are a custom planner."
        # Other stages should keep defaults
        assert pm.system("problem_decompose") == "You are a senior research strategist."

    def test_override_user_template(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
            stages:
              topic_init:
                user: "Custom prompt for {topic}."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        result = pm.user("topic_init", topic="GAN")
        assert result == "Custom prompt for GAN."

    def test_override_block(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
            blocks:
              topic_constraint: "Stay focused on {topic}."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        assert pm.block("topic_constraint", topic="NAS") == "Stay focused on NAS."

    def test_override_json_mode(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
            stages:
              topic_init:
                json_mode: true
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        assert pm.json_mode("topic_init") is True

    def test_missing_file_uses_defaults(self, tmp_path: Path) -> None:
        # A nonexistent override path must not raise — defaults stay in effect.
        pm = PromptManager(tmp_path / "nonexistent.yaml")
        assert pm.has_stage("topic_init")
        assert pm.system("topic_init")

    def test_invalid_yaml_uses_defaults(self, tmp_path: Path) -> None:
        # Unparseable YAML is ignored rather than crashing construction.
        bad_file = tmp_path / "bad.yaml"
        bad_file.write_text(": invalid: yaml: [", encoding="utf-8")
        pm = PromptManager(bad_file)
        assert pm.has_stage("topic_init")

    def test_unknown_stage_in_override_ignored(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
            stages:
              nonexistent_stage:
                system: "Should be ignored."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        # Should not raise
        pm = PromptManager(override_file)
        assert not pm.has_stage("nonexistent_stage")
# ---------------------------------------------------------------------------
# PromptManager — export_yaml
# ---------------------------------------------------------------------------
class TestExportYaml:
    """Tests for PromptManager.export_yaml: the export parses back cleanly,
    covers every stage, and reflects any active overrides."""

    def test_export_roundtrip(self, tmp_path: Path) -> None:
        pm1 = PromptManager()
        export_path = tmp_path / "exported.yaml"
        pm1.export_yaml(export_path)
        assert export_path.exists()
        # Load it back — should parse cleanly
        data = yaml.safe_load(export_path.read_text(encoding="utf-8"))
        assert "stages" in data
        assert "blocks" in data
        assert "version" in data

    def test_export_contains_all_stages(self, tmp_path: Path) -> None:
        pm = PromptManager()
        export_path = tmp_path / "exported.yaml"
        pm.export_yaml(export_path)
        data = yaml.safe_load(export_path.read_text(encoding="utf-8"))
        for stage in pm.stage_names():
            assert stage in data["stages"], f"Missing {stage} in export"

    def test_export_with_overrides(self, tmp_path: Path) -> None:
        # Overridden values must be exported, not the built-in defaults.
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(
            "stages:\n  topic_init:\n    system: CUSTOM\n",
            encoding="utf-8",
        )
        pm = PromptManager(override_file)
        export_path = tmp_path / "exported.yaml"
        pm.export_yaml(export_path)
        data = yaml.safe_load(export_path.read_text(encoding="utf-8"))
        assert data["stages"]["topic_init"]["system"] == "CUSTOM"
# ---------------------------------------------------------------------------
# RenderedPrompt dataclass
# ---------------------------------------------------------------------------
class TestRenderedPrompt:
    """Tests for the RenderedPrompt dataclass: defaults, options, immutability."""

    def test_defaults(self) -> None:
        rp = RenderedPrompt(system="sys", user="usr")
        assert rp.json_mode is False
        assert rp.max_tokens is None

    def test_with_options(self) -> None:
        rp = RenderedPrompt(system="s", user="u", json_mode=True, max_tokens=4096)
        assert rp.json_mode is True
        assert rp.max_tokens == 4096

    def test_frozen(self) -> None:
        # Frozen dataclass: attribute assignment must raise.
        rp = RenderedPrompt(system="s", user="u")
        with pytest.raises(AttributeError):
            rp.system = "modified"  # type: ignore[misc]
================================================
FILE: tests/test_rc_quality.py
================================================
"""Tests for content quality assessment."""
from __future__ import annotations
# pyright: reportMissingImports=false, reportUnknownVariableType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false
from researchclaw.quality import (
assess_quality,
check_strict_quality,
compute_template_ratio,
detect_template_content,
)
# --- Fixture texts for the quality-assessment tests below ---

# Genuine scientific abstract: expected to trigger zero template-pattern matches.
REAL_ABSTRACT = (
    "We propose a novel method for protein structure prediction using "
    "graph neural networks. Our approach achieves state-of-the-art results "
    "on the CASP14 benchmark with 3.2 GDT-TS improvement over AlphaFold2. "
    "We demonstrate that incorporating side-chain interactions as graph "
    "edges significantly improves local structure accuracy."
)

# Pure placeholder text: "Template ...:", "[INSERT ...]", "will describe",
# and "Replace this text" should all be detected.
TEMPLATE_ABSTRACT = (
    "Template abstract: This section will describe the main contributions "
    "of our work. [INSERT your abstract here]. We will discuss the results "
    "in the following sections. Replace this text with your actual content."
)

# Half real content, half placeholders — used for the mid-range ratio tests.
MIXED_CONTENT = (
    "We propose a novel method for protein structure prediction.\n"
    "[TODO: Add more details about the method]\n"
    "Our experiments show significant improvements over baselines.\n"
    "Template introduction: This section will describe the background."
)

# Realistic multi-paragraph paper section: must stay clean despite markdown
# headers and a numbered contribution list.
REAL_PAPER_SECTION = (
    "## Introduction\n\n"
    "Recent advances in large language models have demonstrated remarkable "
    "capabilities in natural language understanding and generation. However, "
    "these models often struggle with factual consistency and hallucinate "
    "information. In this work, we address this limitation by introducing "
    "a retrieval-augmented generation framework that grounds model outputs "
    "in verified knowledge sources.\n\n"
    "Our key contributions are:\n"
    "1. A novel attention mechanism for integrating retrieved passages\n"
    "2. A training procedure that incentivizes factual consistency\n"
    "3. Comprehensive evaluation on three benchmark datasets"
)
class TestDetectTemplateContent:
    """Tests for detect_template_content: each match records a pattern_desc and
    a 1-based line_number; real prose yields no matches."""

    def test_real_text_no_matches(self):
        matches = detect_template_content(REAL_ABSTRACT)
        assert len(matches) == 0

    def test_template_text_has_matches(self):
        matches = detect_template_content(TEMPLATE_ABSTRACT)
        assert len(matches) >= 3

    def test_detects_insert_placeholder(self):
        text = "The results show [INSERT your results here] improvement."
        matches = detect_template_content(text)
        assert any("Insert placeholder" in m.pattern_desc for m in matches)

    def test_detects_todo_placeholder(self):
        text = "Method description [TODO: complete this section]."
        matches = detect_template_content(text)
        assert any("TODO" in m.pattern_desc for m in matches)

    def test_detects_template_section(self):
        text = "Template introduction: This paper presents our work."
        matches = detect_template_content(text)
        assert any("Template section" in m.pattern_desc for m in matches)

    def test_detects_future_tense_placeholder(self):
        text = "This section will describe the methodology in detail."
        matches = detect_template_content(text)
        assert any("Future-tense" in m.pattern_desc for m in matches)

    def test_detects_lorem_ipsum(self):
        text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
        matches = detect_template_content(text)
        assert any("Lorem ipsum" in m.pattern_desc for m in matches)

    def test_match_has_line_number(self):
        # line_number is 1-based; the TODO is on the second line.
        text = "Good line\n[TODO: fix this]\nAnother good line"
        matches = detect_template_content(text)
        assert len(matches) == 1
        assert matches[0].line_number == 2

    def test_real_paper_section_clean(self):
        matches = detect_template_content(REAL_PAPER_SECTION)
        assert len(matches) == 0

    def test_empty_text(self):
        matches = detect_template_content("")
        assert len(matches) == 0
class TestComputeTemplateRatio:
    """Tests for compute_template_ratio: fraction of template-flagged content,
    bounded to [0, 1], with 0.0 for empty text."""

    def test_real_text_low_ratio(self):
        ratio = compute_template_ratio(REAL_ABSTRACT)
        assert ratio < 0.05

    def test_template_text_high_ratio(self):
        ratio = compute_template_ratio(TEMPLATE_ABSTRACT)
        assert ratio > 0.5

    def test_mixed_content_moderate_ratio(self):
        # MIXED_CONTENT is half real, half placeholder → mid-range ratio.
        ratio = compute_template_ratio(MIXED_CONTENT)
        assert 0.1 < ratio < 0.9

    def test_empty_text_zero_ratio(self):
        ratio = compute_template_ratio("")
        assert ratio == 0.0

    def test_ratio_bounded_0_1(self):
        ratio = compute_template_ratio(TEMPLATE_ABSTRACT)
        assert 0.0 <= ratio <= 1.0

    def test_real_paper_section_low_ratio(self):
        ratio = compute_template_ratio(REAL_PAPER_SECTION)
        assert ratio < 0.05
class TestAssessQuality:
    """Tests for assess_quality() report structure and flags."""

    def test_report_has_all_fields(self):
        report = assess_quality(REAL_ABSTRACT)
        assert report.total_lines > 0
        assert report.total_chars > 0
        assert isinstance(report.template_ratio, float)
        assert isinstance(report.template_matches, tuple)

    def test_report_to_dict(self):
        payload = assess_quality(MIXED_CONTENT).to_dict()
        expected_keys = (
            "template_ratio",
            "template_matches",
            "has_template_content",
            "match_count",
        )
        for key in expected_keys:
            assert key in payload

    def test_report_has_template_flag(self):
        assert assess_quality(TEMPLATE_ABSTRACT).has_template_content is True
        assert assess_quality(REAL_ABSTRACT).has_template_content is False
class TestCheckStrictQuality:
    """Tests for check_strict_quality() pass/fail semantics."""

    def test_real_text_passes(self):
        ok, _ = check_strict_quality(REAL_ABSTRACT)
        assert ok is True

    def test_template_text_fails(self):
        ok, message = check_strict_quality(TEMPLATE_ABSTRACT)
        assert ok is False
        assert "Template content detected" in message

    def test_custom_threshold(self):
        # A threshold of 1.0 tolerates any amount of template content.
        ok, _ = check_strict_quality(TEMPLATE_ABSTRACT, threshold=1.0)
        assert ok is True

    def test_failure_message_includes_examples(self):
        _, message = check_strict_quality(TEMPLATE_ABSTRACT)
        assert "L" in message
================================================
FILE: tests/test_rc_report.py
================================================
# pyright: basic, reportMissingImports=false, reportUnusedCallResult=false
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.report import generate_report
class TestReport:
    """End-to-end tests for generate_report() against synthetic run dirs.

    The pipeline_summary.json write and the one-stage "all done" summary
    were duplicated verbatim across tests; both are factored into helpers.
    """

    @staticmethod
    def _write_summary(run_dir: Path, payload: dict) -> None:
        """Serialize *payload* as run_dir/pipeline_summary.json."""
        (run_dir / "pipeline_summary.json").write_text(json.dumps(payload))

    @staticmethod
    def _trivial_done_summary() -> dict:
        """Minimal single-stage successful summary shared by several tests."""
        return {
            "run_id": "test",
            "stages_executed": 1,
            "stages_done": 1,
            "stages_failed": 0,
            "final_status": "done",
            "generated": "now",
        }

    def test_report_missing_run_dir(self, tmp_path: Path):
        # A run directory that does not exist must raise immediately.
        with pytest.raises(FileNotFoundError):
            generate_report(tmp_path / "nonexistent")

    def test_report_no_summary(self, tmp_path: Path):
        # An existing directory without pipeline_summary.json is rejected.
        with pytest.raises(ValueError, match="pipeline_summary"):
            generate_report(tmp_path)

    def test_report_minimal_run(self, tmp_path: Path):
        self._write_summary(
            tmp_path,
            {
                "run_id": "rc-test-123",
                "stages_executed": 23,
                "stages_done": 23,
                "stages_blocked": 0,
                "stages_failed": 0,
                "final_status": "done",
                "generated": "2026-03-10T12:00:00Z",
            },
        )
        report = generate_report(tmp_path)
        assert "# ResearchClaw Run Report" in report
        assert "rc-test-123" in report
        assert "✅" in report

    def test_report_with_paper(self, tmp_path: Path):
        self._write_summary(tmp_path, self._trivial_done_summary())
        draft_dir = tmp_path / "stage-17"
        draft_dir.mkdir()
        (draft_dir / "paper_draft.md").write_text(
            "This is a paper with some words in it."
        )
        report = generate_report(tmp_path)
        assert "Paper" in report
        assert "words" in report

    def test_report_with_citations(self, tmp_path: Path):
        self._write_summary(tmp_path, self._trivial_done_summary())
        verify_dir = tmp_path / "stage-23"
        verify_dir.mkdir()
        (verify_dir / "verification_report.json").write_text(
            json.dumps(
                {
                    "total_references": 10,
                    "verified_count": 8,
                    "suspicious_count": 1,
                    "hallucinated_count": 1,
                }
            )
        )
        report = generate_report(tmp_path)
        assert "Citations" in report
        assert "8/10" in report
        assert "Suspicious: 1" in report

    def test_report_with_failures(self, tmp_path: Path):
        self._write_summary(
            tmp_path,
            {
                "run_id": "test",
                "stages_executed": 5,
                "stages_done": 3,
                "stages_failed": 2,
                "final_status": "failed",
                "generated": "now",
            },
        )
        report = generate_report(tmp_path)
        assert "❌" in report
        assert "Warnings" in report
        assert "2 stage(s) failed" in report

    def test_report_with_experiment_results(self, tmp_path: Path):
        self._write_summary(tmp_path, self._trivial_done_summary())
        exp_dir = tmp_path / "stage-12"
        exp_dir.mkdir()
        (exp_dir / "experiment_results.json").write_text(
            json.dumps(
                {
                    "iterations": [{"loss": 0.5}, {"loss": 0.3}],
                    "best_metric": 0.3,
                }
            )
        )
        report = generate_report(tmp_path)
        assert "Experiments" in report
        assert "2 iterations" in report
================================================
FILE: tests/test_rc_runner.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, cast
import pytest
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.pipeline import runner as rc_runner
from researchclaw.pipeline.executor import StageResult
from researchclaw.pipeline.stages import STAGE_SEQUENCE, Stage, StageStatus
@pytest.fixture()
def rc_config(tmp_path: Path) -> RCConfig:
    """Minimal runner-test config; path existence checks are disabled."""
    llm_settings = {
        "provider": "openai-compatible",
        "base_url": "http://localhost:1234/v1",
        "api_key_env": "RC_TEST_KEY",
        "api_key": "inline",
    }
    raw = {
        "project": {"name": "rc-runner-test", "mode": "docs-first"},
        "research": {"topic": "pipeline testing"},
        "runtime": {"timezone": "UTC"},
        "notifications": {"channel": "local"},
        "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
        "openclaw_bridge": {},
        "llm": llm_settings,
    }
    return RCConfig.from_dict(raw, project_root=tmp_path, check_paths=False)
@pytest.fixture()
def adapters() -> AdapterBundle:
    """Default adapter bundle; tests monkeypatch the stage executor anyway."""
    bundle = AdapterBundle()
    return bundle
@pytest.fixture()
def run_dir(tmp_path: Path) -> Path:
    """Fresh, empty run directory under pytest's tmp_path."""
    target = tmp_path / "run"
    target.mkdir()
    return target
def _done(stage: Stage, artifacts: tuple[str, ...] = ("out.md",)) -> StageResult:
    """Build a successful (DONE) StageResult for *stage*."""
    return StageResult(
        stage=stage,
        status=StageStatus.DONE,
        artifacts=artifacts,
    )
def _failed(stage: Stage, msg: str = "boom") -> StageResult:
    """Build a FAILED StageResult for *stage* carrying error message *msg*."""
    return StageResult(
        stage=stage,
        status=StageStatus.FAILED,
        artifacts=(),
        error=msg,
    )
def _blocked(stage: Stage) -> StageResult:
    """Build a gate StageResult blocked pending human approval."""
    return StageResult(
        stage=stage,
        status=StageStatus.BLOCKED_APPROVAL,
        decision="block",
        artifacts=("gate.md",),
    )
def test_execute_pipeline_runs_stages_in_sequence(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """All 23 stages execute in STAGE_SEQUENCE order when each one succeeds."""
    executed: list[Stage] = []

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        executed.append(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-seq",
        config=rc_config,
        adapters=adapters,
    )
    assert executed == list(STAGE_SEQUENCE)
    assert len(outcome) == 23
    assert all(item.status == StageStatus.DONE for item in outcome)
def test_execute_pipeline_stops_on_failed_stage(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """A FAILED stage result halts the pipeline at that stage."""
    fail_stage = Stage.SEARCH_STRATEGY

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        return _failed(stage, "forced failure") if stage == fail_stage else _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-fail",
        config=rc_config,
        adapters=adapters,
    )
    last = outcome[-1]
    assert last.stage == fail_stage
    assert last.status == StageStatus.FAILED
    assert len(outcome) == int(fail_stage)
def test_execute_pipeline_stops_on_gate_when_stop_on_gate_enabled(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """With stop_on_gate=True the run halts at the first blocked gate."""
    gate_stage = Stage.LITERATURE_SCREEN

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        return _blocked(stage) if stage == gate_stage else _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-gate-stop",
        config=rc_config,
        adapters=adapters,
        stop_on_gate=True,
    )
    last = outcome[-1]
    assert last.stage == gate_stage
    assert last.status == StageStatus.BLOCKED_APPROVAL
    assert len(outcome) == int(gate_stage)
def test_execute_pipeline_continues_after_gate_when_stop_on_gate_disabled(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """With stop_on_gate=False a blocked gate is recorded but the run continues."""
    gate_stage = Stage.LITERATURE_SCREEN

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        return _blocked(stage) if stage == gate_stage else _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-gate-continue",
        config=rc_config,
        adapters=adapters,
        stop_on_gate=False,
    )
    assert len(outcome) == 23
    statuses = [item.status for item in outcome]
    assert StageStatus.BLOCKED_APPROVAL in statuses
def test_execute_pipeline_writes_pipeline_summary_json(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """A pipeline_summary.json file is persisted inside the run directory."""
    monkeypatch.setattr(
        rc_runner, "execute_stage", lambda stage, **kwargs: _done(stage)
    )
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-summary",
        config=rc_config,
        adapters=adapters,
    )
    assert (run_dir / "pipeline_summary.json").exists()
def test_pipeline_summary_has_expected_fields_and_values(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """Summary fields reflect a mixture of done/blocked/failed results."""

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        if stage == Stage.LITERATURE_SCREEN:
            return _blocked(stage)
        if stage == Stage.HYPOTHESIS_GEN:
            return _failed(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-summary-fields",
        config=rc_config,
        adapters=adapters,
    )
    raw = (run_dir / "pipeline_summary.json").read_text(encoding="utf-8")
    summary = cast(dict[str, Any], json.loads(raw))
    done_total = sum(1 for item in outcome if item.status == StageStatus.DONE)
    assert summary["run_id"] == "run-summary-fields"
    assert summary["stages_executed"] == len(outcome)
    assert summary["stages_done"] == done_total
    assert summary["stages_blocked"] == 1
    assert summary["stages_failed"] == 1
    assert summary["from_stage"] == 1
    assert summary["final_stage"] == int(Stage.HYPOTHESIS_GEN)
    assert summary["final_status"] == "failed"
    assert "generated" in summary
def test_execute_pipeline_from_stage_skips_earlier_stages(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """from_stage=PAPER_OUTLINE starts there, skipping all prior stages."""
    executed: list[Stage] = []

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        executed.append(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-from-stage",
        config=rc_config,
        adapters=adapters,
        from_stage=Stage.PAPER_OUTLINE,
    )
    skipped = int(Stage.PAPER_OUTLINE) - 1
    assert executed[0] == Stage.PAPER_OUTLINE
    assert len(executed) == len(STAGE_SEQUENCE) - skipped
    assert len(outcome) == len(executed)
def test_execute_pipeline_writes_kb_entries_when_kb_root_provided(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
    tmp_path: Path,
) -> None:
    """Every completed stage triggers one knowledge-base write when kb_root is set."""
    kb_calls: list[tuple[int, str, str]] = []

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        stage_dir = run_dir / f"stage-{int(stage):02d}"
        stage_dir.mkdir(parents=True, exist_ok=True)
        (stage_dir / "out.md").write_text(f"stage {int(stage)}", encoding="utf-8")
        return _done(stage)

    def fake_kb_write(
        kb_root: Path,
        stage_id: int,
        stage_name: str,
        run_id: str,
        artifacts: list[str],
        stage_dir: Path,
        **kwargs,
    ):
        _ = kb_root, artifacts, stage_dir, kwargs
        kb_calls.append((stage_id, stage_name, run_id))
        return []

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    monkeypatch.setattr(rc_runner, "write_stage_to_kb", fake_kb_write)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-kb",
        config=rc_config,
        adapters=adapters,
        kb_root=tmp_path / "kb-out",
    )
    assert len(outcome) == 23
    assert len(kb_calls) == 23
    assert kb_calls[0] == (1, "topic_init", "run-kb")
def test_execute_pipeline_passes_auto_approve_flag_to_execute_stage(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """auto_approve_gates=True is forwarded to every execute_stage call."""
    flags: list[bool] = []

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        flags.append(kwargs["auto_approve_gates"])
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-auto-approve",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert flags
    assert all(flags)
@pytest.mark.parametrize(
    ("candidate", "already_started", "should"),
    [
        (Stage.TOPIC_INIT, False, True),
        (Stage.PROBLEM_DECOMPOSE, False, False),
        (Stage.PAPER_DRAFT, True, True),
    ],
)
def test_should_start_logic(candidate: Stage, already_started: bool, should: bool) -> None:
    """_should_start matches from_stage unless execution has already begun."""
    assert rc_runner._should_start(candidate, Stage.TOPIC_INIT, already_started) is should
@pytest.mark.parametrize(
    ("stage_results", "want_status", "want_final_stage"),
    [
        ([], "no_stages", int(Stage.TOPIC_INIT)),
        ([_done(Stage.TOPIC_INIT)], "done", int(Stage.TOPIC_INIT)),
        (
            [_done(Stage.TOPIC_INIT), _failed(Stage.PROBLEM_DECOMPOSE)],
            "failed",
            int(Stage.PROBLEM_DECOMPOSE),
        ),
    ],
)
def test_build_pipeline_summary_core_fields(
    stage_results, want_status: str, want_final_stage: int
) -> None:
    """Core summary fields track the final result (or defaults when empty)."""
    summary = rc_runner._build_pipeline_summary(
        run_id="run-core",
        results=stage_results,
        from_stage=Stage.TOPIC_INIT,
    )
    assert summary["run_id"] == "run-core"
    assert summary["final_status"] == want_status
    assert summary["final_stage"] == want_final_stage
def test_pipeline_prints_stage_progress(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Per-stage progress (running/done/FAILED + error text) is printed to stdout."""
    scripted = [
        StageResult(
            stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("topic.json",)
        ),
        StageResult(
            stage=Stage.PROBLEM_DECOMPOSE,
            status=StageStatus.DONE,
            artifacts=("tree.json",),
        ),
        StageResult(
            stage=Stage.SEARCH_STRATEGY,
            status=StageStatus.FAILED,
            artifacts=(),
            error="LLM timeout",
        ),
    ]

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = stage, kwargs
        # Hand out scripted results in order, repeating the last one.
        if len(scripted) > 1:
            return scripted.pop(0)
        return scripted[0]

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    monkeypatch.setattr(rc_runner, "write_stage_to_kb", lambda *args, **kwargs: [])
    _ = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="rc-test-001",
        config=rc_config,
        adapters=adapters,
    )
    out = capsys.readouterr().out
    assert "TOPIC_INIT — running..." in out
    assert "TOPIC_INIT — done" in out
    assert "SEARCH_STRATEGY — FAILED" in out
    assert "LLM timeout" in out
def test_pipeline_prints_elapsed_time(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Each stage progress line carries an elapsed-time suffix like '(0.01s)'."""
    import re

    scripted = iter(
        [
            StageResult(
                stage=Stage.TOPIC_INIT,
                status=StageStatus.DONE,
                artifacts=("topic.json",),
            ),
            StageResult(
                stage=Stage.PROBLEM_DECOMPOSE,
                status=StageStatus.FAILED,
                artifacts=(),
                error="test",
            ),
        ]
    )
    monkeypatch.setattr(
        rc_runner, "execute_stage", lambda *args, **kwargs: next(scripted)
    )
    monkeypatch.setattr(rc_runner, "write_stage_to_kb", lambda *args, **kwargs: [])
    _ = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="rc-test-002",
        config=rc_config,
        adapters=adapters,
    )
    out = capsys.readouterr().out
    assert re.search(r"\d+\.\d+s\)", out), f"No elapsed time found in: {out}"
# ── PIVOT/PROCEED/REFINE decision loop tests ──
def _pivot_result(stage: Stage) -> StageResult:
    """DONE decision-stage result carrying a 'pivot' decision."""
    return StageResult(
        stage=stage,
        status=StageStatus.DONE,
        artifacts=("decision.md",),
        decision="pivot",
    )
def _refine_result(stage: Stage) -> StageResult:
    """DONE decision-stage result carrying a 'refine' decision."""
    return StageResult(
        stage=stage,
        status=StageStatus.DONE,
        artifacts=("decision.md",),
        decision="refine",
    )
def test_pivot_decision_triggers_rollback_to_hypothesis_gen(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """A single 'pivot' decision rolls back to HYPOTHESIS_GEN and is
    recorded in decision_history.json."""
    seen: list[Stage] = []
    pivot_count = 0

    def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        seen.append(stage)
        nonlocal pivot_count
        # Pivot exactly once; subsequent decision stages proceed normally.
        if stage == Stage.RESEARCH_DECISION and pivot_count == 0:
            pivot_count += 1
            return _pivot_result(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage)
    # Return value intentionally discarded: only side effects are asserted.
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-pivot",
        config=rc_config,
        adapters=adapters,
    )
    # Should have seen HYPOTHESIS_GEN at least twice (original + rollback)
    hyp_gen_count = sum(1 for s in seen if s == Stage.HYPOTHESIS_GEN)
    assert hyp_gen_count >= 2
    # Decision history should be recorded
    history_path = run_dir / "decision_history.json"
    assert history_path.exists()
    history = json.loads(history_path.read_text())
    assert len(history) == 1
    assert history[0]["decision"] == "pivot"
def test_refine_decision_triggers_rollback_to_iterative_refine(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """A single 'refine' decision rolls back to ITERATIVE_REFINE once."""
    seen: list[Stage] = []
    refine_count = 0

    def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        seen.append(stage)
        nonlocal refine_count
        # Refine exactly once; subsequent decision stages proceed normally.
        if stage == Stage.RESEARCH_DECISION and refine_count == 0:
            refine_count += 1
            return _refine_result(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage)
    # Return value intentionally discarded: only the execution trace matters.
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-refine",
        config=rc_config,
        adapters=adapters,
    )
    # Should have seen ITERATIVE_REFINE at least twice
    refine_stage_count = sum(1 for s in seen if s == Stage.ITERATIVE_REFINE)
    assert refine_stage_count >= 2
def test_max_pivot_count_prevents_infinite_loop(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """Perpetual 'pivot' decisions are capped at MAX_DECISION_PIVOTS rollbacks."""
    from researchclaw.pipeline.stages import MAX_DECISION_PIVOTS

    seen: list[Stage] = []

    def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        seen.append(stage)
        # Always PIVOT — should be limited by MAX_DECISION_PIVOTS
        if stage == Stage.RESEARCH_DECISION:
            return _pivot_result(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage)
    # Return value intentionally discarded: only the decision count matters.
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-max-pivot",
        config=rc_config,
        adapters=adapters,
    )
    # RESEARCH_DECISION should appear at most MAX_DECISION_PIVOTS + 1 times
    decision_count = sum(1 for s in seen if s == Stage.RESEARCH_DECISION)
    assert decision_count <= MAX_DECISION_PIVOTS + 1
def test_proceed_decision_does_not_trigger_rollback(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    """An all-'proceed' run executes each stage exactly once and writes no
    decision history file."""
    seen: list[Stage] = []

    def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        seen.append(stage)
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage)
    # Return value intentionally discarded: assertions use trace + filesystem.
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-proceed",
        config=rc_config,
        adapters=adapters,
    )
    # Should be exactly 23 stages, no rollback
    assert len(seen) == 23
    assert not (run_dir / "decision_history.json").exists()
def test_read_pivot_count_returns_zero_for_no_history(run_dir: Path) -> None:
    """With no decision_history.json present the pivot count defaults to 0."""
    count = rc_runner._read_pivot_count(run_dir)
    assert count == 0
def test_record_decision_history_appends(run_dir: Path) -> None:
    """Successive decisions accumulate in decision_history.json in order."""
    rc_runner._record_decision_history(run_dir, "pivot", Stage.HYPOTHESIS_GEN, 1)
    rc_runner._record_decision_history(run_dir, "refine", Stage.ITERATIVE_REFINE, 2)
    history = json.loads((run_dir / "decision_history.json").read_text())
    recorded = [entry["decision"] for entry in history]
    assert recorded == ["pivot", "refine"]
# ── Deliverables packaging tests ──
def _setup_stage_artifacts(run_dir: Path) -> None:
"""Create typical stage-22 and stage-23 output files for testing."""
s22 = run_dir / "stage-22"
s22.mkdir(parents=True, exist_ok=True)
(s22 / "paper_final.md").write_text("# My Paper\nContent here.", encoding="utf-8")
(s22 / "paper.tex").write_text("\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}", encoding="utf-8")
(s22 / "references.bib").write_text("@article{smith2024,\n title={Test}\n}", encoding="utf-8")
code_dir = s22 / "code"
code_dir.mkdir()
(code_dir / "main.py").write_text("print('hello')", encoding="utf-8")
(code_dir / "requirements.txt").write_text("numpy\n", encoding="utf-8")
(code_dir / "README.md").write_text("# Code\n", encoding="utf-8")
s23 = run_dir / "stage-23"
s23.mkdir(parents=True, exist_ok=True)
(s23 / "paper_final_verified.md").write_text("# My Paper (verified)\nContent.", encoding="utf-8")
(s23 / "references_verified.bib").write_text("@article{smith2024,\n title={Test}\n}", encoding="utf-8")
(s23 / "verification_report.json").write_text(
json.dumps({"summary": {"total": 5, "verified": 4}}), encoding="utf-8"
)
def test_package_deliverables_collects_all_artifacts(
    run_dir: Path, rc_config: RCConfig
) -> None:
    """Paper, code, and verification artifacts all land in run_dir/deliverables."""
    _setup_stage_artifacts(run_dir)
    dest = rc_runner._package_deliverables(run_dir, "run-pkg-test", rc_config)
    assert dest is not None
    assert dest == run_dir / "deliverables"
    for name in (
        "paper_final.md",
        "paper.tex",
        "references.bib",
        "verification_report.json",
        "manifest.json",
    ):
        assert (dest / name).exists()
    assert (dest / "code" / "main.py").exists()
    manifest = json.loads((dest / "manifest.json").read_text())
    assert manifest["run_id"] == "run-pkg-test"
    assert "paper_final.md" in manifest["files"]
def test_package_deliverables_prefers_verified_versions(
    run_dir: Path, rc_config: RCConfig
) -> None:
    """Stage-23 verified outputs win over the base stage-22 versions."""
    _setup_stage_artifacts(run_dir)
    rc_runner._package_deliverables(run_dir, "run-verified", rc_config)
    dest = run_dir / "deliverables"
    # Should contain verified content (from stage 23), not base (from stage 22)
    assert "verified" in (dest / "paper_final.md").read_text(encoding="utf-8")
    assert "smith2024" in (dest / "references.bib").read_text(encoding="utf-8")
def test_package_deliverables_falls_back_to_stage22(
    run_dir: Path, rc_config: RCConfig
) -> None:
    """When stage 23 outputs are missing, falls back to stage 22 versions."""
    stage22 = run_dir / "stage-22"
    stage22.mkdir(parents=True, exist_ok=True)
    (stage22 / "paper_final.md").write_text("# Base Paper", encoding="utf-8")
    (stage22 / "references.bib").write_text("@article{a,title={A}}", encoding="utf-8")
    dest = rc_runner._package_deliverables(run_dir, "run-fallback", rc_config)
    assert dest is not None
    assert "Base Paper" in (dest / "paper_final.md").read_text(encoding="utf-8")
def test_package_deliverables_returns_none_when_no_stage_artifacts(
    run_dir: Path, tmp_path: Path,
) -> None:
    """Returns None when no stage artifacts exist and no style files found."""
    # Use a config with an unknown conference so style files aren't bundled
    raw = {
        "project": {"name": "empty-test", "mode": "docs-first"},
        "research": {"topic": "empty"},
        "runtime": {"timezone": "UTC"},
        "notifications": {"channel": "local"},
        "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
        "openclaw_bridge": {},
        "llm": {
            "provider": "openai-compatible",
            "base_url": "http://localhost:1234/v1",
            "api_key_env": "RC_TEST_KEY",
            "api_key": "inline",
        },
        "export": {"target_conference": "unknown_conf_9999"},
    }
    cfg = RCConfig.from_dict(raw, project_root=tmp_path, check_paths=False)
    outcome = rc_runner._package_deliverables(run_dir, "run-empty", cfg)
    assert outcome is None
    assert not (run_dir / "deliverables").exists()
def test_package_deliverables_includes_style_files(
    run_dir: Path, rc_config: RCConfig
) -> None:
    """Style files (.sty, .bst) for the target conference are bundled."""
    _setup_stage_artifacts(run_dir)
    dest = rc_runner._package_deliverables(run_dir, "run-styles", rc_config)
    assert dest is not None
    # Default config uses neurips_2025 → should have neurips_2025.sty
    assert (dest / "neurips_2025.sty").exists()
    manifest = json.loads((dest / "manifest.json").read_text())
    assert "neurips_2025.sty" in manifest["files"]
# ── Atomic checkpoint write tests ──
def test_write_checkpoint_uses_atomic_rename(run_dir: Path) -> None:
    """Checkpoint must be written via temp file + rename, not direct write"""
    rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-atomic")
    checkpoint = run_dir / "checkpoint.json"
    assert checkpoint.exists()
    payload = json.loads(checkpoint.read_text(encoding="utf-8"))
    assert payload["last_completed_stage"] == int(Stage.TOPIC_INIT)
    assert payload["run_id"] == "run-atomic"
def test_write_checkpoint_leaves_no_temp_files(run_dir: Path) -> None:
    """Atomic write must clean up temp files on success"""
    rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-clean")
    leftovers = list(run_dir.glob("*.tmp"))
    assert leftovers == [], f"Leftover temp files: {leftovers}"
def test_write_checkpoint_preserves_old_on_write_failure(
    run_dir: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """If the temp-file write fails, the existing checkpoint must survive"""
    import builtins

    # Seed a valid checkpoint first so there is something to preserve.
    rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-ok")
    original_open = builtins.open

    def _exploding_open(path, *args, **kwargs):
        # After os.close(fd), _write_checkpoint opens via path string —
        # intercept temp-file opens (checkpoint_*.tmp)
        if isinstance(path, (str, Path)) and "checkpoint_" in str(path):
            raise OSError("disk full")
        # Also fail fd-based opens, in case the implementation writes
        # through the raw file descriptor instead of a path.
        if isinstance(path, int):
            raise OSError("disk full")
        return original_open(path, *args, **kwargs)

    # Patch via monkeypatch so builtins.open is restored even on failure.
    monkeypatch.setattr(builtins, "open", _exploding_open)
    with pytest.raises(OSError):
        rc_runner._write_checkpoint(run_dir, Stage.PROBLEM_DECOMPOSE, "run-ok")
    # Original checkpoint must be intact
    data = json.loads((run_dir / "checkpoint.json").read_text(encoding="utf-8"))
    assert data["last_completed_stage"] == int(Stage.TOPIC_INIT)
    # Temp file must be cleaned up
    assert list(run_dir.glob("checkpoint_*.tmp")) == []
def test_write_checkpoint_overwrites_previous(run_dir: Path) -> None:
    """A second checkpoint call must fully replace the first"""
    for stage in (Stage.TOPIC_INIT, Stage.PROBLEM_DECOMPOSE):
        rc_runner._write_checkpoint(run_dir, stage, "run-1")
    payload = json.loads((run_dir / "checkpoint.json").read_text(encoding="utf-8"))
    assert payload["last_completed_stage"] == int(Stage.PROBLEM_DECOMPOSE)
    assert payload["last_completed_name"] == Stage.PROBLEM_DECOMPOSE.name
def _degraded(stage: Stage) -> StageResult:
    """DONE result whose quality-gate decision is 'degraded'."""
    return StageResult(
        stage=stage,
        artifacts=("quality_report.json",),
        status=StageStatus.DONE,
        decision="degraded",
    )
def test_degraded_quality_gate_continues_pipeline(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """When quality gate returns decision='degraded', pipeline continues to completion."""
    executed: list[Stage] = []

    def fake_stage(stage: Stage, **kwargs) -> StageResult:
        _ = kwargs
        executed.append(stage)
        return _degraded(stage) if stage == Stage.QUALITY_GATE else _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", fake_stage)
    outcome = rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-degraded",
        config=rc_config,
        adapters=adapters,
    )
    # All 23 stages should execute (not stopped at quality gate)
    assert len(outcome) == 23
    assert executed == list(STAGE_SEQUENCE)
    # Quality gate result should have decision="degraded"
    gate_results = [r for r in outcome if r.stage == Stage.QUALITY_GATE]
    assert gate_results[0].decision == "degraded"
    assert gate_results[0].status == StageStatus.DONE
    # Pipeline summary should have degraded=True
    summary = json.loads((run_dir / "pipeline_summary.json").read_text())
    assert summary["degraded"] is True
    # Output should show DEGRADED message
    assert "DEGRADED" in capsys.readouterr().out
def test_package_deliverables_called_after_pipeline(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Deliverables packaging is called at end of execute_pipeline."""
    _setup_stage_artifacts(run_dir)

    def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
        # Explicitly discard kwargs, matching every other mock in this file.
        _ = kwargs
        return _done(stage)

    monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage)
    rc_runner.execute_pipeline(
        run_dir=run_dir,
        run_id="run-with-deliverables",
        config=rc_config,
        adapters=adapters,
    )
    captured = capsys.readouterr()
    assert "Deliverables packaged" in captured.out
    assert (run_dir / "deliverables" / "manifest.json").exists()
# ---------------------------------------------------------------------------
# BUG-223: _promote_best_stage14 must always write experiment_summary_best.json
# ---------------------------------------------------------------------------
def _make_stage14_summary(run_dir: Path, suffix: str, pm_value: float) -> None:
"""Helper: create a stage-14{suffix}/experiment_summary.json."""
d = run_dir / f"stage-14{suffix}"
d.mkdir(parents=True, exist_ok=True)
data = {
"metrics_summary": {
"primary_metric": {"min": pm_value, "max": pm_value, "mean": pm_value, "count": 1}
},
"condition_summaries": {"cond_a": {"metrics": {"primary_metric": pm_value}}},
}
(d / "experiment_summary.json").write_text(json.dumps(data), encoding="utf-8")
class TestPromoteBestStage14BestJson:
    """BUG-223: experiment_summary_best.json must be written even when
    stage-14/ already has the best result (early-return path)."""

    @pytest.fixture()
    def max_config(self, rc_config: RCConfig) -> RCConfig:
        """Config with metric_direction=maximize (accuracy-like metrics)."""
        object.__setattr__(rc_config.experiment, "metric_direction", "maximize")
        return rc_config

    @staticmethod
    def _promoted_mean(run_dir: Path) -> float:
        """Parse experiment_summary_best.json and return the primary-metric mean."""
        payload = json.loads(
            (run_dir / "experiment_summary_best.json").read_text(encoding="utf-8")
        )
        return payload["metrics_summary"]["primary_metric"]["mean"]

    def test_best_json_written_when_current_is_best(
        self, run_dir: Path, max_config: RCConfig
    ) -> None:
        """stage-14/ already best → should still write best.json."""
        for suffix, value in (("", 90.0), ("_v1", 80.0), ("_v2", 70.0)):
            _make_stage14_summary(run_dir, suffix, value)
        rc_runner._promote_best_stage14(run_dir, max_config)  # type: ignore[attr-defined]
        best_path = run_dir / "experiment_summary_best.json"
        assert best_path.exists(), "experiment_summary_best.json must always be written"
        assert self._promoted_mean(run_dir) == 90.0

    def test_best_json_written_when_promotion_needed(
        self, run_dir: Path, max_config: RCConfig
    ) -> None:
        """stage-14/ is NOT best → promote + write best.json."""
        for suffix, value in (("", 70.0), ("_v1", 95.0)):
            _make_stage14_summary(run_dir, suffix, value)
        rc_runner._promote_best_stage14(run_dir, max_config)  # type: ignore[attr-defined]
        assert (run_dir / "experiment_summary_best.json").exists()
        assert self._promoted_mean(run_dir) == 95.0

    def test_best_json_written_with_equal_values(
        self, run_dir: Path, max_config: RCConfig
    ) -> None:
        """BUG-223 exact scenario: stage-14 and stage-14_v1 have equal
        metrics, stage-14_v2 is regressed."""
        for suffix, value in (("", 64.46), ("_v1", 64.46), ("_v2", 26.80)):
            _make_stage14_summary(run_dir, suffix, value)
        rc_runner._promote_best_stage14(run_dir, max_config)  # type: ignore[attr-defined]
        best_path = run_dir / "experiment_summary_best.json"
        assert best_path.exists(), "BUG-223: best.json missing when current is tied-best"
        assert self._promoted_mean(run_dir) == 64.46
class TestPromoteBestStage14AnalysisBest:
    """BUG-225: analysis_best.md must be written from best stage-14 iteration."""

    @pytest.fixture()
    def max_config(self, rc_config: RCConfig) -> RCConfig:
        # Force accuracy-style comparison so the highest mean wins.
        object.__setattr__(rc_config.experiment, "metric_direction", "maximize")
        return rc_config

    def test_analysis_best_written_from_best_iteration(
        self, run_dir: Path, max_config: RCConfig
    ) -> None:
        """analysis_best.md should be copied from the winning iteration (_v1)."""
        _make_stage14_summary(run_dir, "", 70.0)
        _make_stage14_summary(run_dir, "_v1", 95.0)
        # Give each iteration a distinct analysis.md so the source is provable.
        (run_dir / "stage-14" / "analysis.md").write_text("Degenerate analysis", encoding="utf-8")
        (run_dir / "stage-14_v1" / "analysis.md").write_text("Best analysis v1", encoding="utf-8")
        rc_runner._promote_best_stage14(run_dir, max_config)  # type: ignore[attr-defined]
        promoted = run_dir / "analysis_best.md"
        assert promoted.exists(), "BUG-225: analysis_best.md must be written"
        assert promoted.read_text(encoding="utf-8") == "Best analysis v1"

    def test_analysis_best_written_when_current_is_best(
        self, run_dir: Path, max_config: RCConfig
    ) -> None:
        """Even when stage-14 itself is best, analysis_best.md should appear."""
        _make_stage14_summary(run_dir, "", 90.0)
        _make_stage14_summary(run_dir, "_v1", 80.0)
        (run_dir / "stage-14" / "analysis.md").write_text("Best analysis current", encoding="utf-8")
        (run_dir / "stage-14_v1" / "analysis.md").write_text("Worse analysis", encoding="utf-8")
        rc_runner._promote_best_stage14(run_dir, max_config)  # type: ignore[attr-defined]
        promoted = run_dir / "analysis_best.md"
        assert promoted.exists()
        assert promoted.read_text(encoding="utf-8") == "Best analysis current"

    def test_no_analysis_best_when_no_analysis_md(
        self, run_dir: Path, max_config: RCConfig
    ) -> None:
        """Without an analysis.md in the best iteration, nothing is promoted."""
        _make_stage14_summary(run_dir, "", 90.0)
        rc_runner._promote_best_stage14(run_dir, max_config)  # type: ignore[attr-defined]
        assert not (run_dir / "analysis_best.md").exists()
class TestPromoteBestStage14DegenerateDetection:
    """BUG-226: Degenerate near-zero metrics must not be promoted as best."""

    def test_degenerate_minimize_skipped(self, run_dir: Path, rc_config: RCConfig) -> None:
        """Under minimize, a value ~1000x below the runner-up is degenerate."""
        # metric_direction defaults to "minimize"
        _make_stage14_summary(run_dir, "", 7.26e-8)  # degenerate (broken normalization)
        _make_stage14_summary(run_dir, "_v2", 0.37)  # valid
        rc_runner._promote_best_stage14(run_dir, rc_config)  # type: ignore[attr-defined]
        best_file = run_dir / "experiment_summary_best.json"
        assert best_file.exists()
        summary = json.loads(best_file.read_text(encoding="utf-8"))
        metric = summary["metrics_summary"]["primary_metric"]
        assert metric["mean"] == 0.37, "Degenerate value should be skipped, valid v2 promoted"

    def test_legitimate_minimize_not_skipped(self, run_dir: Path, rc_config: RCConfig) -> None:
        """Values in a normal range: the smaller one legitimately wins."""
        _make_stage14_summary(run_dir, "", 0.15)
        _make_stage14_summary(run_dir, "_v1", 0.37)
        rc_runner._promote_best_stage14(run_dir, rc_config)  # type: ignore[attr-defined]
        best_file = run_dir / "experiment_summary_best.json"
        summary = json.loads(best_file.read_text(encoding="utf-8"))
        metric = summary["metrics_summary"]["primary_metric"]
        assert metric["mean"] == 0.15, "Legitimate lower value should be promoted"

    def test_single_candidate_not_affected(self, run_dir: Path, rc_config: RCConfig) -> None:
        """With only one candidate, degenerate detection never skips it."""
        _make_stage14_summary(run_dir, "", 1e-10)
        rc_runner._promote_best_stage14(run_dir, rc_config)  # type: ignore[attr-defined]
        best_file = run_dir / "experiment_summary_best.json"
        summary = json.loads(best_file.read_text(encoding="utf-8"))
        metric = summary["metrics_summary"]["primary_metric"]
        assert metric["mean"] == 1e-10
================================================
FILE: tests/test_rc_sanitization.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.pipeline.executor import _sanitize_fabricated_data
from researchclaw.pipeline.stage_impls._code_generation import _check_rl_compatibility
@pytest.fixture()
def run_dir(tmp_path: Path) -> Path:
    """Provide an empty run directory under pytest's tmp_path."""
    run_path = tmp_path / "run"
    run_path.mkdir()
    return run_path


def _write_experiment_summary(run_dir: Path, data: dict) -> None:
    """Serialize *data* as stage-14/experiment_summary.json inside run_dir."""
    stage_dir = run_dir / "stage-14"
    stage_dir.mkdir(parents=True, exist_ok=True)
    summary_path = stage_dir / "experiment_summary.json"
    summary_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
def test_sanitize_replaces_unverified_numbers(run_dir: Path) -> None:
    """Verified metric values survive; fabricated table cells are scrubbed."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85, "f1": 0.82},
        "best_run": {"metrics": {"accuracy": 0.85}},
    })
    paper = (
        "## Results\n\n"
        "| Method | Accuracy | F1 | Precision |\n"
        "| --- | --- | --- | --- |\n"
        "| Ours | 0.85 | 0.82 | 0.91 |\n"
        "| Baseline | 0.73 | 0.65 | 0.78 |\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # 0.85 / 0.82 match the summary and are kept; the rest are replaced.
    for verified in ("0.85", "0.82"):
        assert verified in sanitized
    for fabricated in ("0.91", "0.73"):
        assert fabricated not in sanitized
    assert "---" in sanitized
    assert report["sanitized"] is True
    assert report["numbers_replaced"] == 4
    assert report["numbers_kept"] == 2


def test_sanitize_preserves_table_structure(run_dir: Path) -> None:
    """Sanitization must not add or drop any table pipes."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"loss": 0.12},
    })
    paper = (
        "| Model | Loss |\n"
        "| --- | --- |\n"
        "| A | 0.12 |\n"
        "| B | 0.8765 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # Same number of pipes before and after → structure intact.
    assert sanitized.count("|") == paper.count("|")
    assert "0.12" in sanitized
    assert "0.8765" not in sanitized


def test_sanitize_no_experiment_summary(run_dir: Path) -> None:
    """With no experiment summary on disk, the paper passes through untouched."""
    paper = "| A | 0.5 |\n| --- | --- |\n| B | 0.6 |\n"
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    assert report["sanitized"] is False
    assert sanitized == paper  # unchanged


def test_sanitize_tolerance_within_1_percent(run_dir: Path) -> None:
    """Values within 1% of a verified metric are treated as matching."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 100.0},
    })
    paper = (
        "| Method | Acc |\n"
        "| --- | --- |\n"
        "| Ours | 100.5 |\n"  # within 1% of 100.0
        "| Other | 110.0 |\n"  # outside 1%
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    assert "100.5" in sanitized  # kept (within tolerance)
    assert "110.0" not in sanitized  # replaced


def test_sanitize_header_row_preserved(run_dir: Path) -> None:
    """The header row of a table is never rewritten."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"val": 5.0},
    })
    paper = (
        "| Col1 | Col2 |\n"
        "| --- | --- |\n"
        "| data | 99.9 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # Header row should be untouched
    assert "| Col1 | Col2 |" in sanitized
def test_sanitize_hp_columns_preserved_in_mixed_table(run_dir: Path) -> None:
    """BUG-184: HP columns in mixed tables should not be sanitized."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
        "best_run": {"metrics": {"accuracy": 0.85}},
    })
    paper = (
        "## Results\n\n"
        "| Method | LR | Batch Size | Accuracy | F1 |\n"
        "| --- | --- | --- | --- | --- |\n"
        "| Ours | 0.0007 | 48 | 0.85 | 0.91 |\n"
        "| Baseline | 0.0001 | 24 | 0.73 | 0.78 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # Hyperparameter columns (LR, Batch Size) are exempt from verification.
    assert "0.0007" in sanitized, "HP column 'LR' value should not be sanitized"
    assert "0.0001" in sanitized, "HP column 'LR' value should not be sanitized"
    # Result columns: verified 0.85 stays, unverified values are scrubbed.
    assert "0.85" in sanitized
    assert "0.91" not in sanitized
    assert "0.73" not in sanitized


def test_sanitize_pure_hp_table_skipped(run_dir: Path) -> None:
    """BUG-192: Pure HP tables (header keywords) should be fully skipped."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
    })
    paper = (
        "| Hyperparameter | Value |\n"
        "| --- | --- |\n"
        "| Learning Rate | 0.0007 |\n"
        "| Batch Size | 48 |\n"
        "| Weight Decay | 0.0005 |\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # A table whose header marks it as hyperparameters is skipped entirely.
    assert "0.0007" in sanitized
    assert "0.0005" in sanitized
    assert report["tables_processed"] == 0


def test_prose_sanitization_replaces_unverified(run_dir: Path) -> None:
    """Unverified numbers in the Results section prose are sanitized."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
        "best_run": {"metrics": {"accuracy": 0.85}},
    })
    paper = (
        "# Introduction\n"
        "Prior work achieved 92.3% accuracy on this task.\n\n"
        "# Results\n"
        "Our method achieved 85.0% accuracy, which is significantly better.\n"
        "The baseline obtained 72.4% accuracy on the same benchmark.\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # 85.0 matches 0.85 × 100 → verified and kept.
    assert "85.0" in sanitized
    # 72.4 is unverified inside Results → replaced with a placeholder.
    assert "72.4" not in sanitized
    assert "[value removed]" in sanitized
    # 92.3 lives in the Introduction, which is never sanitized.
    assert "92.3" in sanitized
    assert report["prose_numbers_replaced"] >= 1
def test_sanitize_model_name_numbers_preserved(run_dir: Path) -> None:
    """BUG-206: Numbers in model names (ResNet-34) must not be replaced."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
        "best_run": {"metrics": {"accuracy": 0.85}},
    })
    # Table with model variant numbers in the first column (ci=1, skipped)
    paper = (
        "## Results\n\n"
        "| Method | Accuracy |\n"
        "| --- | --- |\n"
        "| ResNet-34 (baseline) | 0.85 |\n"
        "| ResNet-50 (teacher) | 0.91 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # The method-name column must survive intact, "34" and "50" included.
    assert "ResNet-34" in sanitized, "Model name 'ResNet-34' should not be sanitized"
    assert "ResNet-50" in sanitized, "Model name 'ResNet-50' should not be sanitized"


def test_sanitize_unicode_hyphen_model_names_preserved(run_dir: Path) -> None:
    """BUG-206: Unicode non-breaking hyphen in model names must not be replaced."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
        "best_run": {"metrics": {"accuracy": 0.85}},
    })
    # U+2011 non-breaking hyphen (common LLM output)
    paper = (
        "## Results\n\n"
        "| Method | Accuracy |\n"
        "| --- | --- |\n"
        "| ResNet\u201134 (baseline) | 0.85 |\n"
        "| ResNet\u201150 (teacher) | 0.91 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    assert "ResNet\u201134" in sanitized, "Model name with U+2011 hyphen should not be sanitized"
    assert "ResNet\u201150" in sanitized, "Model name with U+2011 hyphen should not be sanitized"


def test_prose_sanitization_preserves_introduction(run_dir: Path) -> None:
    """Numbers outside Results/Experiments sections must never be touched."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"val": 0.50},
    })
    paper = (
        "# Introduction\n"
        "Previous methods achieved 94.2% accuracy.\n\n"
        "# Related Work\n"
        "Smith et al. reported 88.7% on the benchmark.\n\n"
        "# Conclusion\n"
        "We demonstrated 50.0% accuracy.\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # None of these sections are Results/Experiments → everything preserved.
    assert "94.2" in sanitized
    assert "88.7" in sanitized
    assert report["prose_numbers_replaced"] == 0
# ---------------------------------------------------------------------------
# RL compatibility check (Improvement G)
# ---------------------------------------------------------------------------
def test_rl_compatibility_dqn_continuous_detected() -> None:
    """DQN paired with a continuous-action env must be reported as an error."""
    code = """
import gymnasium as gym
from stable_baselines3 import DQN
env = gym.make('Pendulum-v1')
model = DQN('MlpPolicy', env)
model.learn(total_timesteps=10000)
"""
    errors = _check_rl_compatibility(code)
    assert len(errors) >= 1
    first = errors[0]
    # The message should name both the algorithm and the offending env.
    assert "DQN" in first
    assert "pendulum" in first.lower()


def test_rl_compatibility_ppo_continuous_ok() -> None:
    """PPO supports continuous actions, so no errors are expected."""
    code = """
import gymnasium as gym
from stable_baselines3 import PPO
env = gym.make('HalfCheetah-v5')
model = PPO('MlpPolicy', env)
model.learn(total_timesteps=100000)
"""
    errors = _check_rl_compatibility(code)
    assert len(errors) == 0
def test_sanitize_reads_promoted_best_data(run_dir: Path) -> None:
    """BUG-222: Sanitizer uses experiment_summary_best.json (promoted best).
    After REFINE, the pipeline promotes the best iteration's data to
    experiment_summary_best.json. The sanitizer should validate against
    that file, not scan all refinement logs.
    """
    # Stale stage-14 data (from a regressed iteration)
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"primary_metric": {"min": 8.42, "max": 8.91, "mean": 8.6467, "count": 3}},
        "best_run": {"metrics": {"primary_metric": 8.65}},
    })
    # Promoted best data (from the winning iteration)
    promoted_payload = {
        "metrics_summary": {"primary_metric": {"min": 73.07, "max": 78.93, "mean": 75.56, "count": 3}},
        "best_run": {"metrics": {"primary_metric": 78.93}},
        "condition_summaries": {
            "Ours": {"metrics": {"primary_metric": 78.93}},
            "SGD": {"metrics": {"primary_metric": 73.07}},
            "AdamW": {"metrics": {"primary_metric": 68.67}},
        },
    }
    (run_dir / "experiment_summary_best.json").write_text(
        json.dumps(promoted_payload, indent=2), encoding="utf-8"
    )
    # Paper uses values from promoted best
    paper = (
        "## Results\n\n"
        "| Method | Accuracy |\n"
        "| --- | --- |\n"
        "| Ours | 78.93 |\n"
        "| SGD | 73.07 |\n"
        "| AdamW | 68.67 |\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # All three values match the promoted best → kept, nothing replaced.
    for value in ("78.93", "73.07", "68.67"):
        assert value in sanitized
    assert report["numbers_kept"] == 3
    assert report["numbers_replaced"] == 0


def test_sanitize_rejects_regressed_refine_data(run_dir: Path) -> None:
    """BUG-222: Regressed REFINE iteration data must NOT pass sanitizer.
    Reproduces the Run 75 fabrication bypass: v1 had 74.52%, v3 regressed
    to 69.30%. Paper cited v3 numbers. The sanitizer should reject them.
    """
    # v1 (best) promoted to experiment_summary_best.json
    v1_payload = {
        "best_run": {"metrics": {"FeatureKD/0/metric": 0.7452}},
        "condition_summaries": {
            "FeatureKD": {"metrics": {"metric": 0.7452}},
            "Teacher": {"metrics": {"metric": 0.7431}},
        },
        "metrics_summary": {"metric": {"mean": 0.7442, "min": 0.7431, "max": 0.7452}},
    }
    (run_dir / "experiment_summary_best.json").write_text(
        json.dumps(v1_payload, indent=2), encoding="utf-8"
    )
    # v3 (regressed) in stage-14 (stale)
    _write_experiment_summary(run_dir, {
        "best_run": {"metrics": {"FeatureKD/0/metric": 0.6930}},
        "condition_summaries": {
            "FeatureKD": {"metrics": {"metric": 0.6930}},
            "Teacher": {"metrics": {"metric": 0.7292}},
        },
        "metrics_summary": {"metric": {"mean": 0.7111, "min": 0.6930, "max": 0.7292}},
    })
    # v3 sandbox data in refinement_log
    refine_dir = run_dir / "stage-13_v2"
    refine_dir.mkdir(parents=True, exist_ok=True)
    (refine_dir / "refinement_log.json").write_text(json.dumps({
        "iterations": [{"sandbox": {"metrics": {"primary_metric": 0.6930}}}]
    }), encoding="utf-8")
    # Paper fabricates v3 numbers
    paper = (
        "## Results\n\n"
        "| Method | Accuracy |\n"
        "| --- | --- |\n"
        "| FeatureKD | 69.30 |\n"
        "| Teacher | 72.92 |\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # 69.30 comes from the regressed v3, not the promoted v1 → must go.
    assert "69.30" not in sanitized
    assert report["numbers_replaced"] >= 1
    # But 74.52 or 74.31 (v1 best) would pass if cited
    paper_v1 = (
        "## Results\n\n"
        "| Method | Accuracy |\n"
        "| --- | --- |\n"
        "| FeatureKD | 74.52 |\n"
        "| Teacher | 74.31 |\n"
    )
    sanitized_v1, report_v1 = _sanitize_fabricated_data(paper_v1, run_dir)
    assert "74.52" in sanitized_v1
    assert "74.31" in sanitized_v1
    assert report_v1["numbers_replaced"] == 0
def test_sanitize_condition_names_with_decimals_preserved(run_dir: Path) -> None:
    """BUG-210: Condition names with decimal params (ema_decay_0.9) must not be damaged."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 73.07},
        "best_run": {"metrics": {"accuracy": 73.07}},
    })
    paper = (
        "## Results\n\n"
        "| Condition | Accuracy |\n"
        "| --- | --- |\n"
        "| ema_decay_0.9 | 73.07 |\n"
        "| ema_decay_0.99 | 69.33 |\n"
        "| swa_start_0.75 | 68.67 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # First column (condition names) must come through completely untouched.
    # Trailing space distinguishes "ema_decay_0.9" from "ema_decay_0.99".
    assert "ema_decay_0.9 " in sanitized, "Condition name 'ema_decay_0.9' damaged"
    assert "ema_decay_0.99" in sanitized, "Condition name 'ema_decay_0.99' damaged"
    assert "swa_start_0.75" in sanitized, "Condition name 'swa_start_0.75' damaged"
    # 73.07 matches the summary → kept.
    assert "73.07" in sanitized


def test_rl_compatibility_dqn_discrete_ok() -> None:
    """DQN with a discrete-action env (CartPole) raises no errors."""
    code = """
import gymnasium as gym
from stable_baselines3 import DQN
env = gym.make('CartPole-v1')
model = DQN('MlpPolicy', env)
"""
    errors = _check_rl_compatibility(code)
    assert len(errors) == 0
# ---------------------------------------------------------------------------
# BUG-211: LaTeX tabular sanitization
# ---------------------------------------------------------------------------
def test_sanitize_latex_tabular_replaces_unverified(run_dir: Path) -> None:
    """BUG-211: Numbers inside \\begin{tabular} must be sanitized."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.4816},
        "best_run": {"metrics": {"accuracy": 0.4816}},
    })
    paper = (
        "## Results\n\n"
        "```latex\n"
        "\\begin{table}[htbp]\n"
        "\\centering\n"
        "\\caption{Test accuracy for all configurations.}\n"
        "\\begin{tabular}{l c}\n"
        "\\toprule\n"
        "Method & Accuracy \\\\\n"
        "\\midrule\n"
        "baseline\\_resnet18 & \\textbf{0.4816} \\\\\n"
        "baseline\\_resnet50 & 0.4451 \\\\\n"
        "dropout\\_standard & 0.3243 \\\\\n"
        "\\bottomrule\n"
        "\\end{tabular}\n"
        "\\end{table}\n"
        "```\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # Verified 0.4816 survives; the two fabricated cells become ---.
    assert "0.4816" in sanitized
    for fabricated in ("0.4451", "0.3243"):
        assert fabricated not in sanitized
    assert "---" in sanitized
    assert report["tables_processed"] >= 1
    assert report["numbers_replaced"] >= 2


def test_sanitize_latex_tabular_hp_table_skipped(run_dir: Path) -> None:
    """BUG-211: LaTeX HP tables should be skipped just like markdown ones."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
    })
    paper = (
        "\\begin{table}[htbp]\n"
        "\\centering\n"
        "\\caption{Training hyperparameters.}\n"
        "\\begin{tabular}{l c}\n"
        "\\toprule\n"
        "Hyperparameter & Value \\\\\n"
        "\\midrule\n"
        "Learning Rate & 0.001 \\\\\n"
        "Batch Size & 128 \\\\\n"
        "Weight Decay & 0.0005 \\\\\n"
        "\\bottomrule\n"
        "\\end{tabular}\n"
        "\\end{table}\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # HP table — every value preserved, table never processed.
    assert "0.001" in sanitized
    assert "0.0005" in sanitized


def test_sanitize_latex_tabular_with_pm(run_dir: Path) -> None:
    """BUG-211: Numbers with ± in LaTeX cells must be individually checked."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 48.16, "accuracy_std": 0.35},
        "best_run": {"metrics": {"accuracy": 48.16}},
        "condition_summaries": {
            "method_a": {"primary_metric_mean": 48.16, "primary_metric_std": 0.35},
        },
    })
    paper = (
        "\\begin{tabular}{l c}\n"
        "\\toprule\n"
        "Method & Accuracy (mean $\\pm$ std) \\\\\n"
        "\\midrule\n"
        "method\\_a & 48.16 $\\pm$ 0.35 \\\\\n"
        "method\\_b & 32.43 $\\pm$ 0.45 \\\\\n"
        "\\bottomrule\n"
        "\\end{tabular}\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # Verified mean and std are kept.
    assert "48.16" in sanitized
    assert "0.35" in sanitized
    # Unverified mean and std are scrubbed.
    assert "32.43" not in sanitized
    assert "0.45" not in sanitized


def test_sanitize_latex_tabular_preserves_first_column(run_dir: Path) -> None:
    """BUG-211: First column (method names) must be preserved."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": 0.85},
        "best_run": {"metrics": {"accuracy": 0.85}},
    })
    paper = (
        "\\begin{tabular}{l r r r r}\n"
        "\\toprule\n"
        "Method & Seed 0 & Seed 1 & Seed 2 & Mean \\\\\n"
        "\\midrule\n"
        "resnet\\_18 & 0.4861 & 0.4809 & 0.4777 & 0.4816 \\\\\n"
        "resnet\\_50 & 0.4455 & 0.4459 & 0.4438 & 0.4451 \\\\\n"
        "\\bottomrule\n"
        "\\end{tabular}\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # Method names in the leading column must survive.
    assert "resnet\\_18" in sanitized
    assert "resnet\\_50" in sanitized
# ---------------------------------------------------------------------------
# BUG-224: Statistical analysis tables should NOT be sanitized
# ---------------------------------------------------------------------------
def test_sanitize_skips_statistical_analysis_table(run_dir: Path) -> None:
    """BUG-224: Tables with t-statistics, p-values, and effect sizes are
    derived from experiment data and should not be sanitized.

    Two tables are present: a Results table (subject to sanitization) and a
    Statistical Analysis table (exempt).
    """
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": {"mean": 64.26}},
        "condition_summaries": {"ce": {"metrics": {"accuracy": 64.26}}},
    })
    paper = (
        "## Results\n\n"
        "| Method | Accuracy |\n"
        "|--------|----------|\n"
        "| CE | 64.26 |\n"
        "| SCE | 56.93 |\n\n"
        "## Statistical Analysis\n\n"
        "| Comparison | t-statistic | p-value |\n"
        "|-----------|------------|--------|\n"
        "| CE vs SCE | 7.3267 | 0.0123 |\n"
        "| CE vs GCE | 1.7100 | 0.0569 |\n"
    )
    sanitized, report = _sanitize_fabricated_data(paper, run_dir)
    # Results table: 64.26 is verified and kept; 56.93 is NOT verified and
    # must be replaced.  (The previous assertion
    # `... or "---" in sanitized` was vacuous: the markdown separator row
    # "|--------|" always contains "---", so the test could never fail.)
    assert "64.26" in sanitized
    assert "56.93" not in sanitized, "Unverified 56.93 must be sanitized"
    # Statistical table: t-statistics and p-values are derived → preserved.
    assert "7.3267" in sanitized, "BUG-224: t-statistic was sanitized"
    assert "0.0123" in sanitized, "BUG-224: p-value was sanitized"
    assert "1.7100" in sanitized, "BUG-224: t-statistic was sanitized"
    assert "0.0569" in sanitized, "BUG-224: p-value was sanitized"
def test_sanitize_preserves_common_hp_values(run_dir: Path) -> None:
    """BUG-224: Common HP values like 0.7 should be in the always-allowed set."""
    _write_experiment_summary(run_dir, {
        "metrics_summary": {"accuracy": {"mean": 64.26}},
        "condition_summaries": {"ce": {"metrics": {"accuracy": 64.26}}},
    })
    paper = (
        "| Method | q | Accuracy |\n"
        "|--------|---|----------|\n"
        "| GCE | 0.7 | 64.26 |\n"
        "| GCE-05 | 0.5 | 66.77 |\n"
    )
    sanitized, _ = _sanitize_fabricated_data(paper, run_dir)
    # Both q values belong to the always-allowed hyperparameter set.
    assert "0.7" in sanitized, "BUG-224: q=0.7 was incorrectly sanitized"
    assert "0.5" in sanitized
================================================
FILE: tests/test_rc_sentinel.py
================================================
# pyright: reportPrivateUsage=false
"""Tests for the sentinel watchdog and heartbeat system."""
from __future__ import annotations
import json
import os
import subprocess
from pathlib import Path
import pytest
from researchclaw.pipeline import runner as rc_runner
from researchclaw.pipeline.stages import Stage
# ── Heartbeat writing tests ──
class TestHeartbeatWriting:
    """Unit tests for rc_runner._write_heartbeat."""

    def test_write_heartbeat_creates_file(self, tmp_path: Path) -> None:
        rc_runner._write_heartbeat(tmp_path, Stage.TOPIC_INIT, "run-hb-1")
        assert (tmp_path / "heartbeat.json").exists()

    def test_heartbeat_contains_required_fields(self, tmp_path: Path) -> None:
        rc_runner._write_heartbeat(tmp_path, Stage.HYPOTHESIS_GEN, "run-hb-2")
        payload = json.loads((tmp_path / "heartbeat.json").read_text())
        assert payload["pid"] == os.getpid()
        assert payload["last_stage"] == 8
        assert payload["last_stage_name"] == "HYPOTHESIS_GEN"
        assert payload["run_id"] == "run-hb-2"
        assert "timestamp" in payload

    def test_heartbeat_updates_on_each_stage(self, tmp_path: Path) -> None:
        # Two consecutive writes: the file must reflect the latest stage.
        rc_runner._write_heartbeat(tmp_path, Stage.TOPIC_INIT, "run-1")
        first = json.loads((tmp_path / "heartbeat.json").read_text())
        rc_runner._write_heartbeat(tmp_path, Stage.PAPER_DRAFT, "run-1")
        second = json.loads((tmp_path / "heartbeat.json").read_text())
        assert second["last_stage"] == 17
        assert first["last_stage"] == 1
class TestHeartbeatInPipeline:
    """End-to-end check: execute_pipeline writes heartbeat.json per stage."""

    def test_pipeline_writes_heartbeat_after_each_stage(
        self,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
    ) -> None:
        from researchclaw.adapters import AdapterBundle
        from researchclaw.config import RCConfig
        from researchclaw.pipeline.executor import StageResult
        from researchclaw.pipeline.stages import StageStatus

        # Minimal valid config: local KB, dummy LLM endpoint.
        raw_config = {
            "project": {"name": "hb-test", "mode": "docs-first"},
            "research": {"topic": "heartbeat testing"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local"},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost/v1",
                "api_key_env": "K",
                "api_key": "k",
            },
        }
        config = RCConfig.from_dict(raw_config, project_root=tmp_path, check_paths=False)
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        calls = 0

        def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
            # Succeed twice, then fail to stop the pipeline early.
            nonlocal calls
            calls += 1
            if calls >= 3:
                return StageResult(
                    stage=stage, status=StageStatus.FAILED, artifacts=(), error="stop"
                )
            return StageResult(stage=stage, status=StageStatus.DONE, artifacts=("x.md",))

        monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage)
        rc_runner.execute_pipeline(
            run_dir=run_dir,
            run_id="hb-test",
            config=config,
            adapters=AdapterBundle(),
        )
        heartbeat = run_dir / "heartbeat.json"
        assert heartbeat.exists()
        payload = json.loads(heartbeat.read_text())
        assert payload["run_id"] == "hb-test"
# ── Sentinel script syntax check ──
class TestSentinelScript:
    """Sanity checks on the repository-root sentinel.sh watchdog script.

    The script path was previously rebuilt inline in every test; it is now
    computed once by `_script()` (DRY — behavior unchanged).
    """

    @staticmethod
    def _script() -> Path:
        """Return the path to sentinel.sh two levels above this test file."""
        return Path(__file__).parent.parent / "sentinel.sh"

    def test_sentinel_script_exists(self) -> None:
        assert self._script().exists()

    def test_sentinel_script_is_valid_bash(self) -> None:
        # `bash -n` parses the script without executing it.
        result = subprocess.run(
            ["bash", "-n", str(self._script())],
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0, f"Bash syntax error: {result.stderr}"

    def test_sentinel_script_is_executable(self) -> None:
        assert os.access(self._script(), os.X_OK)

    def test_sentinel_script_has_shebang(self) -> None:
        first_line = self._script().read_text().splitlines()[0]
        assert first_line.startswith("#!/")

    def test_sentinel_prints_usage_on_no_args(self) -> None:
        result = subprocess.run(
            ["bash", str(self._script())],
            capture_output=True,
            text=True,
        )
        # Should fail because no run_dir argument provided
        assert result.returncode != 0
================================================
FILE: tests/test_rc_stages.py
================================================
import pytest
from researchclaw.pipeline.stages import (
DECISION_ROLLBACK,
GATE_ROLLBACK,
GATE_STAGES,
MAX_DECISION_PIVOTS,
NEXT_STAGE,
PHASE_MAP,
PREVIOUS_STAGE,
STAGE_SEQUENCE,
TRANSITION_MAP,
Stage,
StageStatus,
TransitionEvent,
TransitionOutcome,
advance,
default_rollback_stage,
gate_required,
)
def test_stage_enum_has_exactly_23_members():
    """The Stage enum defines all 23 pipeline stages."""
    assert len(Stage) == 23


@pytest.mark.parametrize(
    "index,stage", list(enumerate(STAGE_SEQUENCE, start=1))
)
def test_stage_values_follow_sequence_order(index: int, stage: Stage):
    """Each stage's integer value equals its 1-based sequence position."""
    assert int(stage) == index


def test_stage_sequence_contains_all_23_stages_in_order():
    assert len(STAGE_SEQUENCE) == 23
    assert STAGE_SEQUENCE[0] is Stage.TOPIC_INIT
    assert STAGE_SEQUENCE[-1] is Stage.CITATION_VERIFY
    # Declaration order of the enum matches the canonical sequence.
    assert tuple(Stage) == STAGE_SEQUENCE


def test_next_stage_boundary_values():
    # First and last forward transitions.
    assert NEXT_STAGE[Stage.TOPIC_INIT] is Stage.PROBLEM_DECOMPOSE
    assert NEXT_STAGE[Stage.EXPORT_PUBLISH] is Stage.CITATION_VERIFY


def test_previous_stage_boundary_values():
    # The first stage has no predecessor.
    assert PREVIOUS_STAGE[Stage.TOPIC_INIT] is None
    assert PREVIOUS_STAGE[Stage.PROBLEM_DECOMPOSE] is Stage.TOPIC_INIT


def test_gate_stages_matches_expected_set():
    expected_gates = {Stage.LITERATURE_SCREEN, Stage.EXPERIMENT_DESIGN, Stage.QUALITY_GATE}
    assert GATE_STAGES == frozenset(expected_gates)


def test_gate_rollback_map_matches_expected_targets():
    expected_targets = {
        Stage.LITERATURE_SCREEN: Stage.LITERATURE_COLLECT,
        Stage.EXPERIMENT_DESIGN: Stage.HYPOTHESIS_GEN,
        Stage.QUALITY_GATE: Stage.PAPER_OUTLINE,
    }
    assert GATE_ROLLBACK == expected_targets
def test_phase_map_has_8_phases_with_expected_membership():
    """PHASE_MAP groups the 23 stages into the 8 documented phases."""
    expected = {
        "A: Research Scoping": (
            Stage.TOPIC_INIT,
            Stage.PROBLEM_DECOMPOSE,
        ),
        "B: Literature Discovery": (
            Stage.SEARCH_STRATEGY,
            Stage.LITERATURE_COLLECT,
            Stage.LITERATURE_SCREEN,
            Stage.KNOWLEDGE_EXTRACT,
        ),
        "C: Knowledge Synthesis": (
            Stage.SYNTHESIS,
            Stage.HYPOTHESIS_GEN,
        ),
        "D: Experiment Design": (
            Stage.EXPERIMENT_DESIGN,
            Stage.CODE_GENERATION,
            Stage.RESOURCE_PLANNING,
        ),
        "E: Experiment Execution": (
            Stage.EXPERIMENT_RUN,
            Stage.ITERATIVE_REFINE,
        ),
        "F: Analysis & Decision": (
            Stage.RESULT_ANALYSIS,
            Stage.RESEARCH_DECISION,
        ),
        "G: Paper Writing": (
            Stage.PAPER_OUTLINE,
            Stage.PAPER_DRAFT,
            Stage.PEER_REVIEW,
            Stage.PAPER_REVISION,
        ),
        "H: Finalization": (
            Stage.QUALITY_GATE,
            Stage.KNOWLEDGE_ARCHIVE,
            Stage.EXPORT_PUBLISH,
            Stage.CITATION_VERIFY,
        ),
    }
    assert len(PHASE_MAP) == 8
    for phase_name, stages in expected.items():
        assert PHASE_MAP[phase_name] == stages


def test_phase_map_covers_all_stages_exactly_once():
    # Flatten every phase's stage tuple and confirm full, non-overlapping cover.
    flattened = tuple(stage for stages in PHASE_MAP.values() for stage in stages)
    assert len(flattened) == 23
    assert set(flattened) == set(Stage)
@pytest.mark.parametrize(
    "status",
    [StageStatus.PENDING, StageStatus.RETRYING, StageStatus.PAUSED],
)
def test_start_event_transitions_to_running_from_allowed_states(status: StageStatus):
    """START moves any startable status to RUNNING on the same stage."""
    result = advance(Stage.EXPERIMENT_RUN, status, TransitionEvent.START)
    assert result.status is StageStatus.RUNNING
    assert result.next_stage is Stage.EXPERIMENT_RUN


def test_succeed_event_on_non_gate_stage_transitions_to_done():
    result = advance(
        Stage.SEARCH_STRATEGY,
        StageStatus.RUNNING,
        TransitionEvent.SUCCEED,
        hitl_required_stages=(5, 9, 20),
    )
    assert result.status is StageStatus.DONE
    assert result.next_stage is Stage.LITERATURE_COLLECT
    assert result.checkpoint_required is True
    assert result.decision == "proceed"


def test_succeed_event_on_gate_stage_transitions_to_blocked_approval():
    # Stage 5 (LITERATURE_SCREEN) is in the HITL set → approval required.
    result = advance(
        Stage.LITERATURE_SCREEN,
        StageStatus.RUNNING,
        TransitionEvent.SUCCEED,
        hitl_required_stages=(5, 20),
    )
    assert result.status is StageStatus.BLOCKED_APPROVAL
    assert result.next_stage is Stage.LITERATURE_SCREEN
    assert result.checkpoint_required is False
    assert result.decision == "block"


def test_approve_event_transitions_blocked_stage_to_done():
    result = advance(
        Stage.EXPERIMENT_DESIGN,
        StageStatus.BLOCKED_APPROVAL,
        TransitionEvent.APPROVE,
        hitl_required_stages=(5, 9, 20),
    )
    assert result.status is StageStatus.DONE
    assert result.next_stage is Stage.CODE_GENERATION
    assert result.checkpoint_required is True


def test_reject_event_rolls_back_to_default_gate_mapping():
    # QUALITY_GATE's default rollback target is PAPER_OUTLINE.
    result = advance(
        Stage.QUALITY_GATE,
        StageStatus.BLOCKED_APPROVAL,
        TransitionEvent.REJECT,
        hitl_required_stages=(5, 9, 20),
    )
    assert result.status is StageStatus.PENDING
    assert result.stage is Stage.PAPER_OUTLINE
    assert result.next_stage is Stage.PAPER_OUTLINE
    assert result.rollback_stage is Stage.PAPER_OUTLINE
    assert result.checkpoint_required is True
    assert result.decision == "pivot"


def test_reject_event_uses_explicit_rollback_stage_when_provided():
    result = advance(
        Stage.PAPER_REVISION,
        StageStatus.BLOCKED_APPROVAL,
        TransitionEvent.REJECT,
        rollback_stage=Stage.PAPER_OUTLINE,
    )
    assert result.status is StageStatus.PENDING
    assert result.stage is Stage.PAPER_OUTLINE
    assert result.next_stage is Stage.PAPER_OUTLINE
    assert result.rollback_stage is Stage.PAPER_OUTLINE


def test_timeout_event_transitions_to_paused_with_block_decision():
    result = advance(
        Stage.LITERATURE_SCREEN,
        StageStatus.BLOCKED_APPROVAL,
        TransitionEvent.TIMEOUT,
    )
    assert result.status is StageStatus.PAUSED
    assert result.next_stage is Stage.LITERATURE_SCREEN
    assert result.checkpoint_required is True
    assert result.decision == "block"
def test_fail_event_transitions_running_to_failed_with_retry_decision():
    """FAIL during RUNNING lands in FAILED with a retry recommendation."""
    result = advance(Stage.EXPERIMENT_RUN, StageStatus.RUNNING, TransitionEvent.FAIL)
    assert result.status is StageStatus.FAILED
    assert result.next_stage is Stage.EXPERIMENT_RUN
    assert result.checkpoint_required is True
    assert result.decision == "retry"


def test_retry_event_transitions_failed_to_retrying():
    """RETRY moves a FAILED stage into RETRYING without changing the stage."""
    result = advance(Stage.EXPERIMENT_RUN, StageStatus.FAILED, TransitionEvent.RETRY)
    assert result.status is StageStatus.RETRYING
    assert result.next_stage is Stage.EXPERIMENT_RUN
    assert result.decision == "retry"


def test_resume_event_transitions_paused_to_running():
    """RESUME re-activates a PAUSED stage in place."""
    result = advance(Stage.EXPERIMENT_RUN, StageStatus.PAUSED, TransitionEvent.RESUME)
    assert result.status is StageStatus.RUNNING
    assert result.next_stage is Stage.EXPERIMENT_RUN


def test_pause_event_transitions_failed_to_paused():
    """PAUSE parks a FAILED stage with a checkpoint and a block decision."""
    result = advance(Stage.EXPERIMENT_RUN, StageStatus.FAILED, TransitionEvent.PAUSE)
    assert result.status is StageStatus.PAUSED
    assert result.next_stage is Stage.EXPERIMENT_RUN
    assert result.checkpoint_required is True
    assert result.decision == "block"


def test_invalid_transition_raises_value_error():
    """START from a DONE stage is not a legal transition."""
    with pytest.raises(ValueError, match="Unsupported transition"):
        _ = advance(Stage.TOPIC_INIT, StageStatus.DONE, TransitionEvent.START)


def test_advance_rejects_unknown_transition_event_string():
    """A raw string that is not a TransitionEvent member is rejected."""
    with pytest.raises(ValueError, match="not a valid TransitionEvent"):
        _ = advance(Stage.TOPIC_INIT, StageStatus.PENDING, "unknown")
@pytest.mark.parametrize("stage", tuple(GATE_STAGES))
def test_gate_required_for_gate_stages_with_default_config(stage: Stage):
    """With no HITL config supplied, every gate stage requires a gate."""
    assert gate_required(stage, None) is True


@pytest.mark.parametrize("stage", tuple(GATE_STAGES))
def test_gate_required_respects_hitl_stage_subset(stage: Stage):
    """A gate stage is required iff its numeric value is in the HITL subset."""
    required = (5, 20)
    # Fixed: compare with `==`, not `is` — these are two independently computed
    # booleans, and identity comparison only works because CPython interns
    # True/False. Equality expresses the intended check.
    assert gate_required(stage, required) == (int(stage) in required)


@pytest.mark.parametrize("stage", tuple(s for s in Stage if s not in GATE_STAGES))
def test_gate_required_is_false_for_non_gate_stages(stage: Stage):
    """Non-gate stages never require a HITL gate, regardless of config."""
    assert gate_required(stage, (5, 9, 20)) is False
@pytest.mark.parametrize(
    "stage,expected",
    [
        (Stage.LITERATURE_SCREEN, Stage.LITERATURE_COLLECT),
        (Stage.EXPERIMENT_DESIGN, Stage.HYPOTHESIS_GEN),
        (Stage.QUALITY_GATE, Stage.PAPER_OUTLINE),
    ],
)
def test_default_rollback_stage_for_known_gate_mappings(stage: Stage, expected: Stage):
    """Each known gate stage maps to its documented rollback target."""
    assert default_rollback_stage(stage) is expected


def test_default_rollback_stage_for_unknown_stage_uses_previous_stage():
    """Stages without an explicit mapping roll back to their predecessor."""
    assert default_rollback_stage(Stage.PAPER_DRAFT) is Stage.PAPER_OUTLINE


def test_default_rollback_stage_for_first_stage_returns_self():
    """The very first stage has no predecessor, so it rolls back to itself."""
    assert default_rollback_stage(Stage.TOPIC_INIT) is Stage.TOPIC_INIT
def test_transition_outcome_field_values_are_exposed():
    """TransitionOutcome exposes the field values it was constructed with."""
    built = TransitionOutcome(
        stage=Stage.TOPIC_INIT,
        status=StageStatus.RUNNING,
        next_stage=Stage.TOPIC_INIT,
        rollback_stage=Stage.TOPIC_INIT,
        checkpoint_required=True,
        decision="block",
    )
    assert built.checkpoint_required is True
    assert built.decision == "block"


def test_sequence_and_neighbor_maps_are_consistent_for_all_stages():
    """PREVIOUS_STAGE/NEXT_STAGE must mirror positions in STAGE_SEQUENCE."""
    last_index = len(STAGE_SEQUENCE) - 1
    for position, stage in enumerate(STAGE_SEQUENCE):
        want_prev = STAGE_SEQUENCE[position - 1] if position > 0 else None
        want_next = STAGE_SEQUENCE[position + 1] if position < last_index else None
        assert PREVIOUS_STAGE[stage] is want_prev
        assert NEXT_STAGE[stage] is want_next


def test_transition_map_covers_all_stage_status_values():
    """Every StageStatus has an entry; DONE is terminal (no outgoing targets)."""
    assert set(TRANSITION_MAP.keys()) == set(StageStatus)
    for source_status, targets in TRANSITION_MAP.items():
        assert isinstance(targets, frozenset)
        assert all(target in StageStatus for target in targets)
        if source_status is StageStatus.DONE:
            assert targets == frozenset()
# ── DECISION_ROLLBACK tests ──
def test_decision_rollback_has_pivot_and_refine():
    """Both supported decision keys are present in the rollback table."""
    assert "pivot" in DECISION_ROLLBACK
    assert "refine" in DECISION_ROLLBACK


def test_decision_rollback_pivot_targets_hypothesis_gen():
    """A 'pivot' decision restarts from hypothesis generation."""
    assert DECISION_ROLLBACK["pivot"] is Stage.HYPOTHESIS_GEN


def test_decision_rollback_refine_targets_iterative_refine():
    """A 'refine' decision re-enters the iterative refinement stage."""
    assert DECISION_ROLLBACK["refine"] is Stage.ITERATIVE_REFINE


def test_max_decision_pivots_is_positive():
    """At least one pivot must always be allowed."""
    assert MAX_DECISION_PIVOTS >= 1
================================================
FILE: tests/test_rc_templates.py
================================================
"""Unit tests for researchclaw.templates — conference templates + MD→LaTeX converter."""
from __future__ import annotations
import threading
import pytest
from researchclaw.templates.conference import (
CONFERENCE_REGISTRY,
ConferenceTemplate,
get_template,
list_conferences,
NEURIPS_2024,
NEURIPS_2025,
ICLR_2025,
ICLR_2026,
ICML_2025,
ICML_2026,
)
from researchclaw.templates.converter import (
markdown_to_latex,
_parse_sections,
_extract_title,
_extract_abstract,
_convert_inline,
_escape_latex,
_escape_algo_line,
_render_code_block,
_build_body,
_render_table,
_parse_table_row,
_parse_alignments,
_render_itemize,
_render_enumerate,
_reset_render_counters,
_next_table_num,
_next_figure_num,
check_paper_completeness, # noqa: F401
)
# =====================================================================
# conference.py tests
# =====================================================================
class TestConferenceTemplate:
    """Field-level checks on the bundled ConferenceTemplate instances."""

    def test_neurips_basic_fields(self) -> None:
        """NeurIPS 2024 template carries the expected metadata."""
        tpl = NEURIPS_2024
        assert tpl.name == "neurips_2024"
        assert tpl.display_name == "NeurIPS 2024"
        assert tpl.year == 2024
        assert tpl.document_class == "article"
        assert tpl.style_package == "neurips_2024"
        assert tpl.columns == 1
        assert tpl.author_format == "neurips"
        assert tpl.bib_style == "plainnat"

    def test_iclr_basic_fields(self) -> None:
        """ICLR 2025 template carries the expected metadata."""
        tpl = ICLR_2025
        assert tpl.name == "iclr_2025"
        assert tpl.year == 2025
        assert tpl.style_package == "iclr2025_conference"
        assert tpl.bib_style == "iclr2025_conference"
        assert tpl.columns == 1
        assert tpl.author_format == "iclr"

    def test_icml_basic_fields(self) -> None:
        """ICML 2025 template carries the expected metadata (two-column)."""
        tpl = ICML_2025
        assert tpl.name == "icml_2025"
        assert tpl.year == 2025
        assert tpl.style_package == "icml2025"
        assert tpl.columns == 2
        assert tpl.author_format == "icml"
        assert tpl.bib_style == "icml2025"

    def test_frozen(self) -> None:
        """Templates are frozen dataclasses: attribute writes raise."""
        with pytest.raises(AttributeError):
            NEURIPS_2024.name = "hacked"  # type: ignore[misc]
class TestRenderPreamble:
    """Behavioral checks for ConferenceTemplate.render_preamble()."""

    def test_neurips_preamble_structure(self) -> None:
        """All structural pieces of the NeurIPS preamble must be present."""
        rendered = NEURIPS_2024.render_preamble("My Title", "J. Doe", "An abstract.")
        for fragment in (
            r"\documentclass{article}",
            r"\usepackage[preprint]{neurips_2024}",
            r"\title{My Title}",
            r"\author{J. Doe}",
            r"\begin{abstract}",
            "An abstract.",
            r"\end{abstract}",
            r"\begin{document}",
            r"\maketitle",
        ):
            assert fragment in rendered

    def test_iclr_preamble_no_options(self) -> None:
        """ICLR uses a plain documentclass (no class options)."""
        rendered = ICLR_2025.render_preamble("Title", "Author", "Abstract")
        assert r"\documentclass{article}" in rendered  # no options
        assert r"\usepackage{iclr2025_conference}" in rendered

    def test_icml_author_block(self) -> None:
        """ICML renders the icmlauthorlist environment with an affiliation."""
        rendered = ICML_2025.render_preamble("Title", "Alice", "Abstract")
        for fragment in (
            r"\begin{icmlauthorlist}",
            r"\icmlauthor{Alice}{aff1}",
            r"\end{icmlauthorlist}",
            r"\icmlaffiliation{aff1}{Affiliation}",
        ):
            assert fragment in rendered

    def test_icml_preamble_extra(self) -> None:
        """ICML adds a running-title command."""
        rendered = ICML_2025.render_preamble("Title", "Author", "Abstract")
        assert r"\icmltitlerunning{Title}" in rendered
class TestRenderFooter:
    """Behavioral checks for ConferenceTemplate.render_footer()."""

    def test_neurips_footer(self) -> None:
        """Footer carries the bib style, the bib file, and the document end."""
        rendered = NEURIPS_2024.render_footer("refs")
        assert r"\bibliographystyle{plainnat}" in rendered
        assert r"\bibliography{refs}" in rendered
        assert r"\end{document}" in rendered

    def test_icml_footer(self) -> None:
        """ICML footer uses the icml2025 bib style and default bib file."""
        rendered = ICML_2025.render_footer()
        assert r"\bibliographystyle{icml2025}" in rendered
        assert r"\bibliography{references}" in rendered

    def test_default_bib_file(self) -> None:
        """Without an argument the bib file defaults to 'references'."""
        rendered = NEURIPS_2024.render_footer()
        assert r"\bibliography{references}" in rendered
class TestGetTemplate:
    """Lookup semantics of get_template(): aliases, casing, separators."""

    def test_full_name(self) -> None:
        """A canonical name resolves to the exact template object."""
        assert get_template("neurips_2024") is NEURIPS_2024

    def test_short_alias(self) -> None:
        """Bare conference aliases resolve to the latest year."""
        assert get_template("neurips") is NEURIPS_2025
        assert get_template("iclr") is ICLR_2026
        assert get_template("icml") is ICML_2026

    def test_case_insensitive(self) -> None:
        """Lookup ignores letter case."""
        assert get_template("NeurIPS") is NEURIPS_2025
        assert get_template("ICML_2026") is ICML_2026

    def test_dash_and_space_normalization(self) -> None:
        """Dashes and spaces are normalized to underscores before lookup."""
        assert get_template("neurips-2025") is NEURIPS_2025
        assert get_template("icml 2026") is ICML_2026

    def test_unknown_raises(self) -> None:
        """An unregistered conference raises KeyError with a clear message."""
        with pytest.raises(KeyError, match="Unknown conference"):
            get_template("aaai_2025")
class TestListConferences:
    """Checks for list_conferences() output."""

    def test_returns_canonical_names(self) -> None:
        """Only deduplicated canonical names are listed (no aliases)."""
        names = list_conferences()
        for expected in ("neurips_2025", "iclr_2026", "icml_2026"):
            assert expected in names
        # Should be deduplicated — no aliases (6 conference + 1 generic)
        assert len(names) == 7

    def test_sorted(self) -> None:
        """The listing is alphabetically sorted."""
        names = list_conferences()
        assert names == sorted(names)
class TestConferenceRegistry:
    """Sanity checks on the CONFERENCE_REGISTRY mapping."""

    def test_all_aliases_resolve(self) -> None:
        """Every registry entry maps to a ConferenceTemplate with a non-empty name.

        Fixed: the original iterated ``.items()`` but never used the key,
        leaving an unused loop variable; iterate the values directly.
        """
        for tpl in CONFERENCE_REGISTRY.values():
            assert isinstance(tpl, ConferenceTemplate)
            assert tpl.name  # not empty
# =====================================================================
# converter.py tests
# =====================================================================
class TestParseSections:
    """Tests for the Markdown heading splitter _parse_sections()."""

    def test_empty(self) -> None:
        """Empty input yields a single empty level-1 section."""
        parsed = _parse_sections("")
        assert len(parsed) == 1
        assert parsed[0].level == 1
        assert parsed[0].body == ""

    def test_single_heading(self) -> None:
        """One H1 heading produces one section with its body text."""
        parsed = _parse_sections("# Introduction\nHello world")
        assert len(parsed) == 1
        assert parsed[0].level == 1
        assert parsed[0].heading == "Introduction"
        assert "Hello world" in parsed[0].body

    def test_multiple_headings(self) -> None:
        """H1/H2/H3 headings each open a new section, in document order."""
        parsed = _parse_sections("# Title\nfoo\n## Method\nbar\n### Details\nbaz")
        assert [section.heading for section in parsed] == ["Title", "Method", "Details"]

    def test_preamble_before_heading(self) -> None:
        """Text before the first heading becomes a level-0 preamble section."""
        parsed = _parse_sections("Some text before\n\n# First\nBody")
        assert len(parsed) == 2
        assert parsed[0].level == 0
        assert "Some text before" in parsed[0].body

    def test_heading_lower(self) -> None:
        """heading_lower exposes a lowercased copy of the heading."""
        parsed = _parse_sections("# Abstract\nContent")
        assert parsed[0].heading_lower == "abstract"
class TestExtractTitle:
    """Tests for the title heuristic _extract_title()."""

    def test_bold_title_after_heading(self) -> None:
        """A bold line under '# Title' is taken as the paper title."""
        md = "# Title\n**My Paper**\n\n# Abstract\nblah"
        assert _extract_title(_parse_sections(md), md) == "My Paper"

    def test_first_non_meta_h1(self) -> None:
        """Without a Title block, the first non-meta H1 heading is used."""
        md = "# Introduction\nSome text"
        assert _extract_title(_parse_sections(md), md) == "Introduction"

    def test_fallback(self) -> None:
        """With no usable heading at all, a default title is returned."""
        assert _extract_title(_parse_sections(""), "") == "Untitled Paper"
class TestExtractAbstract:
    """Tests for _extract_abstract()."""

    def test_from_h1(self) -> None:
        """An '# Abstract' H1 section supplies the abstract text."""
        md = "# Abstract\nThis is the abstract.\n\n# Intro\nBody"
        assert "This is the abstract." in _extract_abstract(_parse_sections(md))

    def test_from_h2(self) -> None:
        """An '## Abstract' H2 section also works."""
        md = "# Title\nfoo\n## Abstract\nAbstract text.\n## Intro"
        assert "Abstract text." in _extract_abstract(_parse_sections(md))

    def test_missing_abstract(self) -> None:
        """Documents without an Abstract section yield an empty string."""
        md = "# Introduction\nNo abstract here"
        assert _extract_abstract(_parse_sections(md)) == ""
class TestConvertInline:
    """Tests for the inline Markdown-to-LaTeX converter _convert_inline()."""

    def test_bold(self) -> None:
        """Double-asterisk emphasis becomes \\textbf{}."""
        assert r"\textbf{bold}" in _convert_inline("**bold**")

    def test_italic(self) -> None:
        """Single-asterisk emphasis becomes \\textit{}."""
        assert r"\textit{italic}" in _convert_inline("*italic*")

    def test_inline_code(self) -> None:
        """Backtick spans become \\texttt{}."""
        assert r"\texttt{code}" in _convert_inline("`code`")

    def test_link(self) -> None:
        """Markdown links become \\href{url}{text}."""
        converted = _convert_inline("[text](http://example.com)")
        assert r"\href{http://example.com}{text}" in converted

    def test_special_chars_escaped(self) -> None:
        """Bare %, &, # are escaped for LaTeX."""
        converted = _convert_inline("100% done & 5# items")
        assert r"100\% done \& 5\# items" in converted

    def test_math_preserved(self) -> None:
        """Inline \\(...\\) math passes through untouched."""
        converted = _convert_inline(r"where \(x + y\) is given")
        assert r"\(x + y\)" in converted

    def test_cite_preserved(self) -> None:
        """\\cite{} commands pass through untouched."""
        converted = _convert_inline(r"as shown by \cite{doe2024}")
        assert r"\cite{doe2024}" in converted

    def test_dollar_math_preserved(self) -> None:
        """Dollar-delimited math passes through untouched."""
        converted = _convert_inline("the value $x^2$ is")
        assert "$x^2$" in converted

    def test_pre_escaped_underscore_not_doubled(self) -> None:
        """BUG-182: LLM pre-escapes underscores → must NOT double-escape to \\\\_."""
        result = _convert_inline(r"RawObservation\_PPO\_WithNorm")
        assert r"\\_" not in result, f"Double-escaped: {result}"
        assert r"\_" in result

    def test_pre_escaped_underscore_near_math(self) -> None:
        """BUG-182: Pre-escaped underscore adjacent to math must not break."""
        result = _convert_inline(
            r"RawObs\_PPO. Statistics \(\mu_t\) are given"
        )
        assert r"\\_" not in result
        assert r"\_" in result
        assert r"\(\mu_t\)" in result

    def test_pre_escaped_hash_not_doubled(self) -> None:
        """BUG-182: Pre-escaped hash should not be double-escaped."""
        result = _convert_inline(r"Section \#3 details")
        assert r"\\#" not in result
        assert r"\#" in result
class TestEscapeLatex:
    """Tests for the plain-text escaper _escape_latex()."""

    def test_special_chars(self) -> None:
        """Each LaTeX special character gets a backslash escape."""
        for raw, escaped in (("#", r"\#"), ("%", r"\%"), ("&", r"\&"), ("_", r"\_")):
            assert escaped in _escape_latex(raw)

    def test_math_not_escaped(self) -> None:
        """Characters inside \\(...\\) math are left alone."""
        escaped = _escape_latex(r"value \(x_1\) here")
        assert r"\(x_1\)" in escaped  # underscore inside math preserved
class TestBuildBody:
    """Tests for _build_body() section assembly."""

    def test_skips_title_and_abstract(self) -> None:
        """Title and Abstract sections go to the preamble, not the body."""
        md = "# Title\nfoo\n# Abstract\nbar\n# Introduction\nbaz"
        body = _build_body(_parse_sections(md))
        assert r"\section{Introduction}" in body
        assert "baz" in body
        # Title and abstract should not appear as sections
        assert r"\section{Title}" not in body
        assert r"\section{Abstract}" not in body

    def test_subsection_promoted_when_all_h2(self) -> None:
        """T1.3: When all body sections are H2, they should be promoted to \\section."""
        body = _build_body(_parse_sections("## Method\ntext"))
        # All-H2 document → auto-promoted to \section
        assert r"\section{Method}" in body

    def test_h2_promoted_under_h1_title(self) -> None:
        """When title occupies H1, H2 body sections promote to \\section."""
        md = "# My Paper\ntitle body\n## Method\ntext"
        body = _build_body(_parse_sections(md), title="My Paper")
        assert r"\section{Method}" in body

    def test_subsubsection(self) -> None:
        """H3 follows its parent up one level to \\subsection."""
        body = _build_body(_parse_sections("## Intro\nintro\n### Details\ntext"))
        # H2 promoted to \section, H3 promoted to \subsection
        assert r"\subsection{Details}" in body
class TestListRendering:
    """Tests for bullet and numbered list rendering helpers."""

    def test_bullet_list(self) -> None:
        """_render_itemize wraps items in an itemize environment."""
        rendered = _render_itemize(["First item", "Second item"])
        for fragment in (
            r"\begin{itemize}",
            r"\item First item",
            r"\item Second item",
            r"\end{itemize}",
        ):
            assert fragment in rendered

    def test_numbered_list(self) -> None:
        """_render_enumerate wraps items in an enumerate environment."""
        rendered = _render_enumerate(["Step one", "Step two"])
        for fragment in (r"\begin{enumerate}", r"\item Step one", r"\end{enumerate}"):
            assert fragment in rendered
class TestTableRendering:
    """Tests for Markdown table → LaTeX tabular conversion."""

    def test_parse_table_row(self) -> None:
        """Pipe-delimited cells are split and stripped."""
        assert _parse_table_row("| a | b | c |") == ["a", "b", "c"]

    def test_parse_alignments(self) -> None:
        """Markdown alignment markers map to l/c/r column specs."""
        assert _parse_alignments("| --- | :---: | ---: |", 3) == ["l", "c", "r"]

    def test_render_simple_table(self) -> None:
        """A two-column table renders as a booktabs-style LaTeX table."""
        rendered = _render_table([
            "| Name | Value |",
            "| --- | --- |",
            "| A | 1 |",
            "| B | 2 |",
        ])
        for fragment in (
            r"\begin{table}",
            r"\begin{tabular}{ll}",
            r"\toprule",
            r"\textbf{Name}",
            r"\midrule",
            r"\bottomrule",
            r"\end{tabular}",
            r"\end{table}",
        ):
            assert fragment in rendered

    def test_render_counters_are_thread_local(self) -> None:
        """Each thread gets its own table/figure counters after a reset."""
        observed: list[tuple[int, int, int]] = []
        guard = threading.Lock()

        def sample_counters() -> None:
            _reset_render_counters()
            triple = (_next_table_num(), _next_table_num(), _next_figure_num())
            with guard:
                observed.append(triple)

        workers = [threading.Thread(target=sample_counters) for _ in range(4)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        assert observed == [(1, 2, 1)] * 4
# =====================================================================
# markdown_to_latex integration tests
# =====================================================================
class TestMarkdownToLatex:
    """Integration tests for the full conversion pipeline."""

    # Shared fixture document exercising every converter feature.
    SAMPLE_MD = (
        "# Title\n"
        "**My Great Paper**\n\n"
        "# Abstract\n"
        "This is the abstract.\n\n"
        "# Introduction\n"
        "We study the problem of RL.\n\n"
        "## Related Work\n"
        "Prior work includes **many** approaches.\n\n"
        "# Method\n"
        "Our method uses \\(f(x) = x^2\\) as the objective.\n\n"
        "# Results\n"
        "- Result 1\n"
        "- Result 2\n\n"
        "# Conclusion\n"
        "We conclude.\n\n"
        "# References\n"
        "1. Doe et al. (2024)\n"
    )

    def test_neurips_full(self) -> None:
        """End-to-end NeurIPS render contains every structural fragment."""
        tex = markdown_to_latex(self.SAMPLE_MD, NEURIPS_2024)
        for fragment in (
            r"\documentclass{article}",
            r"\usepackage[preprint]{neurips_2024}",
            r"\title{My Great Paper}",
            r"\begin{abstract}",
            "This is the abstract.",
            r"\section{Introduction}",
            r"\subsection{Related Work}",
            r"\section{Method}",
            r"\begin{itemize}",
            r"\bibliographystyle{plainnat}",
            r"\end{document}",
        ):
            assert fragment in tex

    def test_iclr_full(self) -> None:
        """ICLR render uses the conference style package and bib style."""
        tex = markdown_to_latex(self.SAMPLE_MD, ICLR_2025)
        assert r"\usepackage{iclr2025_conference}" in tex
        assert r"\bibliographystyle{iclr2025_conference}" in tex

    def test_icml_full(self) -> None:
        """ICML render produces the icmlauthorlist author block."""
        tex = markdown_to_latex(self.SAMPLE_MD, ICML_2025, authors="Alice")
        assert r"\begin{icmlauthorlist}" in tex
        assert r"\icmlauthor{Alice}{aff1}" in tex
        assert r"\bibliographystyle{icml2025}" in tex

    def test_custom_title_override(self) -> None:
        """An explicit title argument overrides the extracted one."""
        tex = markdown_to_latex(
            "# Abstract\nblah\n# Intro\nbody",
            NEURIPS_2024,
            title="Override Title",
        )
        assert r"\title{Override Title}" in tex

    def test_custom_authors(self) -> None:
        """An explicit authors argument is rendered into \\author{}."""
        tex = markdown_to_latex(self.SAMPLE_MD, NEURIPS_2024, authors="Jane Doe")
        assert r"\author{Jane Doe}" in tex

    def test_custom_bib_file(self) -> None:
        """A custom bib_file name flows through to \\bibliography{}."""
        tex = markdown_to_latex(self.SAMPLE_MD, NEURIPS_2024, bib_file="my_refs")
        assert r"\bibliography{my_refs}" in tex

    def test_math_preserved_in_output(self) -> None:
        """Inline and display math survive the pipeline untouched."""
        md = "# Abstract\nabs\n# Method\n\\(f(x)\\) and \\[E = mc^2\\]"
        tex = markdown_to_latex(md, NEURIPS_2024, title="T")
        assert r"\(f(x)\)" in tex
        assert r"\[E = mc^2\]" in tex

    def test_empty_paper(self) -> None:
        """Even an empty source yields a complete document skeleton."""
        tex = markdown_to_latex("", NEURIPS_2024, title="Empty")
        assert r"\begin{document}" in tex
        assert r"\end{document}" in tex

    def test_display_math_block(self) -> None:
        """A multi-line \\[...\\] block keeps its contents."""
        md = "# Abstract\nabs\n# Method\n\\[\nx = y + z\n\\]"
        tex = markdown_to_latex(md, NEURIPS_2024, title="T")
        assert "x = y + z" in tex

    def test_code_block(self) -> None:
        """Fenced code becomes a verbatim environment."""
        md = "# Abstract\nabs\n# Method\n```python\nprint('hello')\n```"
        tex = markdown_to_latex(md, NEURIPS_2024, title="T")
        assert r"\begin{verbatim}" in tex
        assert "print('hello')" in tex
        assert r"\end{verbatim}" in tex

    def test_table_in_paper(self) -> None:
        """A Markdown table renders as tabular with bold headers."""
        md = (
            "# Abstract\nabs\n"
            "# Results\n"
            "| Model | Score |\n"
            "| --- | --- |\n"
            "| Ours | 95.0 |\n"
        )
        tex = markdown_to_latex(md, NEURIPS_2024, title="T")
        assert r"\begin{tabular}" in tex
        assert r"\textbf{Model}" in tex
# =====================================================================
# ExportConfig tests
# =====================================================================
class TestExportConfig:
    """Tests for ExportConfig in config.py."""

    def test_default_values(self) -> None:
        """ExportConfig defaults match the documented conventions."""
        from researchclaw.config import ExportConfig

        ec = ExportConfig()
        assert ec.target_conference == "neurips_2025"
        assert ec.authors == "Anonymous"
        assert ec.bib_file == "references"

    def test_frozen(self) -> None:
        """ExportConfig is immutable after construction."""
        from researchclaw.config import ExportConfig

        with pytest.raises(AttributeError):
            ExportConfig().target_conference = "icml"  # type: ignore[misc]

    def test_rcconfig_has_export(self) -> None:
        """RCConfig.load() exposes an export section with defaults."""
        from researchclaw.config import RCConfig

        cfg = RCConfig.load("config.researchclaw.example.yaml", check_paths=False)
        assert hasattr(cfg, "export")
        assert cfg.export.target_conference == "neurips_2025"

    def test_rcconfig_export_from_dict(self) -> None:
        """Export values supplied in the raw dict override the defaults."""
        from pathlib import Path

        import yaml

        from researchclaw.config import RCConfig

        raw = yaml.safe_load(Path("config.researchclaw.example.yaml").read_text())
        raw["export"] = {
            "target_conference": "icml_2025",
            "authors": "Test Author",
            "bib_file": "mybib",
        }
        cfg = RCConfig.from_dict(raw, check_paths=False)
        assert cfg.export.target_conference == "icml_2025"
        assert cfg.export.authors == "Test Author"
        assert cfg.export.bib_file == "mybib"
# =====================================================================
# hitl_required_stages validation update test
# =====================================================================
class TestHitlStageValidation:
    """hitl_required_stages bounds, plus bundled style/bst file checks.

    NOTE(review): the two style-file tests live in this class in the original
    suite even though they concern templates, not HITL validation; they are
    kept here so test IDs are unchanged.
    """

    def test_stage_23_valid(self) -> None:
        """Stages up to 23 are accepted by validate_config."""
        from pathlib import Path

        import yaml

        from researchclaw.config import validate_config

        raw = yaml.safe_load(Path("config.researchclaw.example.yaml").read_text())
        raw.setdefault("security", {})["hitl_required_stages"] = [1, 22, 23]
        result = validate_config(raw, check_paths=False)
        assert result.ok, f"Errors: {result.errors}"

    def test_get_style_files_returns_bundled_sty(self) -> None:
        """Each conference template bundles at least one .sty file."""
        for name in ["neurips_2025", "neurips_2024", "iclr_2026", "iclr_2025", "icml_2026", "icml_2025"]:
            files = get_template(name).get_style_files()
            assert len(files) >= 1, f"No style files for {name}"
            sty_names = [f.name for f in files]
            assert any(f.endswith(".sty") for f in sty_names), f"No .sty file for {name}"

    def test_iclr_icml_have_bst_files(self) -> None:
        """ICLR and ICML templates bundle custom .bst files."""
        for name in ["iclr_2026", "iclr_2025", "icml_2026", "icml_2025"]:
            files = get_template(name).get_style_files()
            bst_names = [f.name for f in files if f.suffix == ".bst"]
            assert len(bst_names) >= 1, f"No .bst file for {name}"

    def test_stage_24_invalid(self) -> None:
        """Stage 24 is out of range and is rejected with a message naming it."""
        from pathlib import Path

        import yaml

        from researchclaw.config import validate_config

        raw = yaml.safe_load(Path("config.researchclaw.example.yaml").read_text())
        raw.setdefault("security", {})["hitl_required_stages"] = [24]
        result = validate_config(raw, check_paths=False)
        assert not result.ok
        assert any("24" in e for e in result.errors)
# =====================================================================
# check_paper_completeness — section word count + bullet density checks
# =====================================================================
class TestCompletenessWordCountAndBullets:
    """Tests for per-section word count and bullet density checks."""

    @staticmethod
    def _make_sections(section_specs: list[tuple[str, int, bool]]) -> list:
        """Build _Section-like objects from (heading, word_count, use_bullets) specs."""
        built = []
        for heading, word_count, use_bullets in section_specs:
            if use_bullets:
                # Roughly word_count words spread over three-word bullet lines.
                body = "\n".join(
                    f"- Point number {i}" for i in range(word_count // 3)
                )
            else:
                body = " ".join(["word"] * word_count)
            built.append(
                type("_Section", (), {
                    "level": 1,
                    "heading": heading,
                    "heading_lower": heading.lower(),
                    "body": body,
                })()
            )
        return built

    def test_completeness_section_word_count_short(self) -> None:
        """A Method section with only 100 words triggers a warning."""
        sections = self._make_sections([
            ("Title", 5, False),
            ("Abstract", 200, False),
            ("Introduction", 900, False),
            ("Related Work", 700, False),
            ("Method", 100, False),
            ("Experiments", 1000, False),
            ("Results", 700, False),
            ("Conclusion", 250, False),
        ])
        warns = check_paper_completeness(sections)
        method_warns = [w for w in warns if "Method" in w and "words" in w]
        assert len(method_warns) >= 1, f"Expected word count warning, got: {warns}"

    def test_completeness_bullet_density(self) -> None:
        """A Method section full of bullet points triggers a warning."""
        sections = self._make_sections([
            ("Title", 5, False),
            ("Abstract", 200, False),
            ("Introduction", 900, False),
            ("Related Work", 700, False),
            ("Method", 300, True),
            ("Experiments", 1000, False),
            ("Results", 700, False),
            ("Conclusion", 250, False),
        ])
        warns = check_paper_completeness(sections)
        bullet_warns = [w for w in warns if "bullet" in w.lower() and "Method" in w]
        assert len(bullet_warns) >= 1, f"Expected bullet warning, got: {warns}"
# =====================================================================
# BUG-177: Algorithm pseudocode escaping tests
# =====================================================================
class TestAlgorithmEscaping:
    """BUG-177: escaping rules for algorithm pseudocode rendering."""

    def test_escape_underscore(self) -> None:
        """Underscores in identifiers are backslash-escaped."""
        assert r"psi\_1" in _escape_algo_line("psi_1")

    def test_escape_hash_comment(self) -> None:
        """A trailing '# comment' becomes \\COMMENT{...}, code kept."""
        escaped = _escape_algo_line("x = y # update rule")
        assert r"\COMMENT{update rule}" in escaped
        assert "x = y" in escaped

    def test_fullline_hash_comment(self) -> None:
        """A whole-line comment becomes only a \\COMMENT{...}."""
        assert _escape_algo_line("# Initialize buffer") == r"\COMMENT{Initialize buffer}"

    def test_escape_percent(self) -> None:
        """Percent signs are escaped."""
        assert r"\%" in _escape_algo_line("accuracy 95%")

    def test_escape_ampersand(self) -> None:
        """Ampersands are escaped."""
        assert r"\&" in _escape_algo_line("x & y")

    def test_preserve_latex_commands(self) -> None:
        """Math spans with LaTeX commands are preserved verbatim."""
        escaped = _escape_algo_line(r"Set $x = \alpha$ and update")
        assert r"$x = \alpha$" in escaped

    def test_render_code_block_algo_escapes(self) -> None:
        """'algorithm' blocks render as algorithmic environments with escaping."""
        pseudocode = (
            "Initialize theta_1, theta_2\n"
            "for t = 1 to T do\n"
            " Sample batch B # prioritized\n"
        )
        rendered = _render_code_block("algorithm", pseudocode)
        assert r"\begin{algorithm}" in rendered
        assert r"\begin{algorithmic}" in rendered
        assert r"theta\_1" in rendered
        assert r"\COMMENT{prioritized}" in rendered

    def test_render_code_block_verbatim_no_escape(self) -> None:
        """Non-algorithm code blocks should use verbatim (no escaping)."""
        rendered = _render_code_block("python", "x_1 = y_2 # comment")
        assert r"\begin{verbatim}" in rendered
        assert "x_1" in rendered  # NOT escaped in verbatim
================================================
FILE: tests/test_rc_validator.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false
from __future__ import annotations
import pytest
from researchclaw.experiment.validator import (
BANNED_MODULES,
DANGEROUS_BUILTINS,
DANGEROUS_CALLS,
CodeValidation,
ValidationIssue,
check_filename_collisions,
extract_imports,
format_issues_for_llm,
validate_code,
validate_imports,
validate_security,
validate_syntax,
)
def _call_source(name: str) -> str:
top = name.split(".")[0]
lines: list[str] = []
if top in {"os", "subprocess", "shutil"}:
lines.append(f"import {top}")
lines.append(f"{name}()")
return "\n".join(lines)
def test_validate_syntax_accepts_valid_code():
    """Well-formed code passes with no issues."""
    report = validate_syntax("x = 1\nif x > 0:\n x += 1")
    assert report.ok is True
    assert report.issues == []


def test_validate_syntax_reports_syntax_error_with_location():
    """A syntax error yields exactly one located error issue."""
    report = validate_syntax("def bad(:\n pass")
    assert report.ok is False
    assert len(report.issues) == 1
    first = report.issues[0]
    assert first.severity == "error"
    assert first.category == "syntax"
    assert first.line == 1
    assert first.col is not None
    assert first.message


@pytest.mark.parametrize("code", ["", " \n\t ", "# comment only\n# still comment"])
def test_validate_syntax_accepts_empty_whitespace_and_comment_only(code: str):
    """Empty, whitespace-only and comment-only sources are all valid."""
    report = validate_syntax(code)
    assert report.ok is True
    assert report.issues == []
def test_validate_security_accepts_safe_code():
    """Benign os.path usage raises no security issues."""
    safe = 'import os\nvalue = os.path.join("a", "b")\nprint(value)'
    report = validate_security(safe)
    assert report.ok is True
    assert report.issues == []


def test_validate_security_skips_when_code_has_syntax_error():
    """Unparseable code is skipped; syntax errors are reported elsewhere."""
    report = validate_security("def broken(:\n pass")
    assert report.ok is True
    assert report.issues == []


@pytest.mark.parametrize("builtin_name", sorted(DANGEROUS_BUILTINS))
def test_validate_security_flags_every_dangerous_builtin_call(builtin_name: str):
    """Each dangerous builtin is flagged as exactly one security error."""
    if builtin_name == "__import__":
        snippet = '__import__("os")'
    elif builtin_name == "compile":
        snippet = 'compile("x = 1", "", "exec")'
    else:
        snippet = f'{builtin_name}("print(1)")'
    report = validate_security(snippet)
    assert len(report.issues) == 1
    flagged = report.issues[0]
    assert flagged.severity == "error"
    assert flagged.category == "security"
    assert flagged.message == f"Dangerous built-in call: {builtin_name}()"
@pytest.mark.parametrize("call_name", sorted(DANGEROUS_CALLS))
def test_validate_security_flags_every_dangerous_call(call_name: str):
result = validate_security(_call_source(call_name))
messages = [issue.message for issue in result.issues]
assert f"Dangerous call: {call_name}()" in messages
assert all(issue.severity == "error" for issue in result.issues)
assert all(issue.category == "security" for issue in result.issues)
@pytest.mark.parametrize("module_name", sorted(BANNED_MODULES))
def test_validate_security_flags_every_banned_import(module_name: str):
result = validate_security(f"import {module_name}")
assert len(result.issues) == 1
issue = result.issues[0]
assert issue.severity == "error"
assert issue.category == "security"
assert issue.message == f"Banned module import: {module_name}"
@pytest.mark.parametrize("module_name", sorted(BANNED_MODULES))
def test_validate_security_flags_every_banned_from_import(module_name: str):
result = validate_security(f"from {module_name} import x")
assert len(result.issues) == 1
issue = result.issues[0]
assert issue.severity == "error"
assert issue.category == "security"
assert issue.message == f"Banned module import: from {module_name}"
def test_validate_imports_recognizes_stdlib_modules_by_default():
    """Stdlib imports produce no warnings when no explicit set is given."""
    report = validate_imports("import json\nfrom math import sqrt")
    assert report.ok is True
    assert report.warnings == []


def test_validate_imports_warns_for_unavailable_package():
    """An unknown module yields a single availability warning, not an error."""
    report = validate_imports("import totally_missing_pkg")
    assert report.ok is True
    assert len(report.warnings) == 1
    only = report.warnings[0]
    assert only.severity == "warning"
    assert only.category == "import"
    assert (
        only.message
        == "Module 'totally_missing_pkg' may not be available in sandbox"
    )


def test_validate_imports_respects_custom_available_set():
    """Only modules outside the supplied availability set are warned about."""
    report = validate_imports(
        "import alpha\nimport beta\nimport gamma",
        available={"alpha", "gamma"},
    )
    assert [w.message for w in report.warnings] == [
        "Module 'beta' may not be available in sandbox",
    ]


def test_validate_imports_returns_no_warnings_for_syntax_error_input():
    """Unparseable code yields no import warnings."""
    report = validate_imports("def bad(:\n pass", available=set())
    assert report.ok is True
    assert report.warnings == []


@pytest.mark.parametrize("code", ["", " \n\t ", "# comment only"])
def test_validate_imports_handles_empty_like_inputs(code: str):
    """Empty-like sources yield no warnings."""
    report = validate_imports(code, available=set())
    assert report.ok is True
    assert report.warnings == []
def test_validate_code_combines_security_and_import_issues_in_order():
code = 'import os\nos.system("echo hi")\nimport unknown_mod'
result = validate_code(code, available_packages={"os"})
assert result.ok is False
assert [i.category for i in result.issues] == ["security", "import"]
assert result.issues[0].message == "Dangerous call: os.system()"
assert (
result.issues[1].message
== "Module 'unknown_mod' may not be available in sandbox"
)
def test_validate_code_short_circuits_after_syntax_error():
    """A syntax error is the sole reported issue; later checks are skipped."""
    report = validate_code("def bad(:\n pass")
    assert len(report.issues) == 1
    assert report.issues[0].category == "syntax"
def test_validate_code_skip_security_excludes_security_issues():
    """skip_security=True drops security findings but keeps import warnings."""
    code = 'import os\nos.system("echo hi")\nimport unknown_mod'
    report = validate_code(code, available_packages={"os"}, skip_security=True)
    assert [issue.category for issue in report.issues] == ["import"]
def test_validate_code_skip_imports_excludes_import_warnings():
    """skip_imports=True leaves only the single security finding."""
    code = 'import os\nos.system("echo hi")\nimport unknown_mod'
    report = validate_code(code, available_packages={"os"}, skip_imports=True)
    assert all(found.category == "security" for found in report.issues)
    assert len(report.issues) == 1
def test_validate_code_skip_both_returns_clean_for_safe_code():
    """With both checks disabled, harmless code validates with no issues."""
    report = validate_code("x = 1", skip_security=True, skip_imports=True)
    assert report.ok is True
    assert report.issues == []
def test_validate_code_uses_available_packages_for_import_validation():
    """available_packages is forwarded to the import check."""
    code = "import alpha\nimport beta"
    report = validate_code(code, available_packages={"alpha"})
    messages = [issue.message for issue in report.issues]
    assert messages == ["Module 'beta' may not be available in sandbox"]
def test_extract_imports_supports_import_and_from_import_styles():
    """Both plain and from-imports contribute their top-level module name."""
    snippet = (
        "import os\nimport numpy as np\nfrom pandas import DataFrame\nfrom x.y import z"
    )
    assert extract_imports(snippet) == {"os", "numpy", "pandas", "x"}
def test_extract_imports_supports_multiple_aliases_and_dedupes():
    """Comma-separated and aliased imports dedupe down to top-level names."""
    snippet = "import os.path, os, json as js\nfrom json import loads"
    assert extract_imports(snippet) == {"os", "json"}
def test_extract_imports_ignores_relative_import_without_module_name():
    """A bare 'from . import x' names no module and is ignored."""
    assert extract_imports("from . import local_mod") == set()
def test_extract_imports_includes_relative_import_with_module_name():
    """Dotted relative imports keep the first package component."""
    assert extract_imports("from ..pkg.sub import thing") == {"pkg"}
def test_extract_imports_returns_empty_set_for_syntax_error():
    """Unparseable code yields an empty import set rather than raising."""
    assert extract_imports("def bad(:\n pass") == set()
@pytest.mark.parametrize("code", ["", " \n\t", "# comment only"])
def test_extract_imports_handles_empty_like_inputs(code: str):
    """Empty-like inputs yield an empty import set."""
    assert extract_imports(code) == set()
def test_format_issues_for_llm_returns_no_issues_message_when_clean():
    """A validation with no issues formats as the fixed 'clean' sentence."""
    assert format_issues_for_llm(CodeValidation()) == "No issues found."
def test_format_issues_for_llm_formats_issues_with_and_without_line():
    """Issues with a line number show it; those without say 'unknown location'."""
    syntax_issue = ValidationIssue(
        severity="error",
        category="syntax",
        message="invalid syntax",
        line=3,
    )
    import_issue = ValidationIssue(
        severity="warning",
        category="import",
        message="Module 'x' may be missing",
        line=None,
    )
    formatted = format_issues_for_llm(
        CodeValidation(issues=[syntax_issue, import_issue])
    )
    assert "- [ERROR] (syntax) invalid syntax @ line 3" in formatted
    assert (
        "- [WARNING] (import) Module 'x' may be missing @ unknown location" in formatted
    )
def test_format_issues_for_llm_preserves_issue_order():
    """Formatted lines appear in the same order as the issue list."""
    first = ValidationIssue(severity="warning", category="import", message="first")
    second = ValidationIssue(
        severity="error", category="security", message="second", line=9
    )
    lines = format_issues_for_llm(CodeValidation(issues=[first, second])).splitlines()
    assert lines[0] == "- [WARNING] (import) first @ unknown location"
    assert lines[1] == "- [ERROR] (security) second @ line 9"
def test_code_validation_ok_true_when_no_errors_present():
    """Warnings alone do not flip ok to False."""
    only_warning = ValidationIssue(severity="warning", category="import", message="warn")
    assert CodeValidation(issues=[only_warning]).ok is True
def test_code_validation_ok_false_when_error_present():
    """Any error-severity issue makes ok False."""
    only_error = ValidationIssue(severity="error", category="syntax", message="bad")
    assert CodeValidation(issues=[only_error]).ok is False
def test_code_validation_errors_and_warnings_filter_correctly():
    """The errors/warnings properties partition issues by severity."""
    err = ValidationIssue(severity="error", category="security", message="danger")
    warn = ValidationIssue(
        severity="warning", category="import", message="maybe missing"
    )
    combined = CodeValidation(issues=[err, warn])
    assert combined.errors == [err]
    assert combined.warnings == [warn]
def test_code_validation_summary_for_no_issues():
    """An empty validation summarizes as a pass."""
    assert CodeValidation().summary() == "Code validation passed."
def test_code_validation_summary_for_errors_only():
    """Summary mentions only the error count when no warnings exist."""
    bad = ValidationIssue(severity="error", category="syntax", message="bad")
    assert CodeValidation(issues=[bad]).summary() == "Code validation: 1 error(s)"
def test_code_validation_summary_for_warnings_only():
    """Summary mentions only the warning count when no errors exist."""
    warn = ValidationIssue(severity="warning", category="import", message="warn")
    assert CodeValidation(issues=[warn]).summary() == "Code validation: 1 warning(s)"
def test_code_validation_summary_for_errors_and_warnings():
    """Summary lists error and warning counts together."""
    issues = [
        ValidationIssue(severity="error", category="syntax", message="bad"),
        ValidationIssue(severity="warning", category="import", message="warn"),
    ]
    summary = CodeValidation(issues=issues).summary()
    assert summary == "Code validation: 1 error(s), 1 warning(s)"
# ---------------------------------------------------------------------------
# check_filename_collisions (BUG-202)
# ---------------------------------------------------------------------------
def test_filename_collision_detects_config_py():
    """BUG-202: config.py shadows pip 'config' package."""
    warnings = check_filename_collisions({"config.py": "x = 1", "main.py": "print(1)"})
    assert len(warnings) == 1
    first = warnings[0]
    assert "shadows stdlib/pip" in first
    assert "config" in first
def test_filename_collision_detects_stdlib_shadows():
    """Filenames shadowing stdlib modules should be flagged."""
    flagged = check_filename_collisions({"json.py": "x = 1"})
    assert len(flagged) == 1
    assert "json" in flagged[0]
def test_filename_collision_allows_safe_names():
    """Normal experiment filenames should not trigger warnings."""
    safe_files = {
        "main.py": "print(1)",
        "models.py": "class M: pass",
        "training.py": "def train(): pass",
        "data_loader.py": "def load(): pass",
        "experiment_config.py": "LR = 0.01",
        "requirements.txt": "torch",
    }
    assert check_filename_collisions(safe_files) == []
def test_filename_collision_multiple_shadows():
    """Multiple shadowing files should each produce a warning."""
    files = {"config.py": "", "logging.py": "", "main.py": ""}
    assert len(check_filename_collisions(files)) == 2
================================================
FILE: tests/test_results_table_builder.py
================================================
"""Tests for results_table_builder — pre-built LaTeX tables."""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.pipeline.verified_registry import VerifiedRegistry
from researchclaw.templates.results_table_builder import (
LatexTable,
build_condition_whitelist,
build_results_tables,
)
# Directory of recorded pipeline-run artifacts (repo_root/artifacts); tests
# that read it skip when the referenced run is not present.
ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"
def _make_registry(
    conditions: dict[str, dict[int, float]],
    primary_metric: float | None = None,
) -> VerifiedRegistry:
    """Create a registry from simple condition → {seed: value} mapping."""
    best_run_metrics: dict[str, float] = {}
    condition_summaries: dict[str, dict] = {}
    for cond_name, seeds in conditions.items():
        # One best-run metric entry per (condition, seed) pair.
        for seed_idx, value in seeds.items():
            best_run_metrics[f"{cond_name}/{seed_idx}/metric"] = value
        mean_metric = sum(seeds.values()) / len(seeds) if seeds else 0
        condition_summaries[cond_name] = {"metrics": {"metric": mean_metric}}
    if primary_metric is not None:
        best_run_metrics["primary_metric"] = primary_metric
    summary = {
        "best_run": {"metrics": best_run_metrics},
        "condition_summaries": condition_summaries,
        "metrics_summary": {},
    }
    return VerifiedRegistry.from_experiment(summary)
class TestBuildResultsTables:
    """build_results_tables behavior on synthetic registries from _make_registry."""

    def test_basic_table(self):
        """Two multi-seed conditions produce a main table plus a per-seed table."""
        reg = _make_registry(
            {
                "Baseline": {0: 80.0, 1: 82.0, 2: 81.0},
                "Proposed": {0: 85.0, 1: 87.0, 2: 86.0},
            },
            primary_metric=86.0,
        )
        tables = build_results_tables(reg, metric_name="Accuracy (\\%)")
        assert len(tables) == 2  # main + per-seed
        main = tables[0]
        assert main.label == "tab:main_results"
        assert "AUTO-GENERATED" in main.latex_code
        assert "\\begin{table}" in main.latex_code
        assert "Baseline" in main.latex_code
        assert "Proposed" in main.latex_code
        assert main.n_conditions == 2

    def test_best_is_bolded(self):
        """The best condition under 'maximize' appears in \\textbf."""
        reg = _make_registry(
            {
                "Baseline": {0: 70.0, 1: 72.0},
                "Proposed": {0: 85.0, 1: 87.0},
            }
        )
        tables = build_results_tables(reg, metric_direction="maximize")
        main = tables[0]
        # Proposed should be bold (higher metric)
        assert "\\textbf" in main.latex_code

    def test_single_seed_marker(self):
        """A condition with only one seed gets a \\ddagger footnote marker."""
        reg = _make_registry(
            {
                "Baseline": {0: 80.0, 1: 82.0},
                "Proposed": {0: 90.0},  # Single seed
            }
        )
        tables = build_results_tables(reg)
        main = tables[0]
        assert "\\ddagger" in main.latex_code  # Single-seed footnote

    def test_no_conditions(self):
        """An empty registry produces no tables at all."""
        reg = VerifiedRegistry()
        tables = build_results_tables(reg)
        assert len(tables) == 0

    def test_all_single_seed_no_per_seed_table(self):
        """When every condition has one seed, the per-seed table is omitted."""
        reg = _make_registry(
            {
                "A": {0: 80.0},
                "B": {0: 70.0},
            }
        )
        tables = build_results_tables(reg)
        # Only 1 table (main), no per-seed table (all single seed)
        assert len(tables) == 1

    def test_per_seed_table_structure(self):
        """The second table is labeled tab:per_seed and lists per-seed values."""
        reg = _make_registry(
            {
                "DQN": {0: 156.1, 1: 105.5, 2: 356.7},
                "DQN+Abstraction": {0: 98.1, 1: 456.7, 2: 282.0},
            }
        )
        tables = build_results_tables(reg)
        assert len(tables) == 2
        seed_table = tables[1]
        assert seed_table.label == "tab:per_seed"
        assert "156.10" in seed_table.latex_code or "156.1" in seed_table.latex_code
        assert "Seed 0" in seed_table.latex_code

    def test_two_column_uses_table_star(self):
        """two_column=True switches to the table* environment."""
        reg = _make_registry({"A": {0: 80.0, 1: 82.0}})
        tables = build_results_tables(reg, two_column=True)
        assert "\\begin{table*}" in tables[0].latex_code

    def test_verified_values_populated(self):
        """Condition means (mean(80, 82) == 81) land in verified_values."""
        reg = _make_registry(
            {"A": {0: 80.0, 1: 82.0}, "B": {0: 70.0, 1: 72.0}}
        )
        tables = build_results_tables(reg)
        main = tables[0]
        # Tolerance fallback guards against float formatting of the mean.
        assert 81.0 in main.verified_values or any(
            abs(v - 81.0) < 0.01 for v in main.verified_values
        )

    def test_special_chars_escaped(self):
        """Underscores in condition names are LaTeX-escaped."""
        reg = _make_registry({"DQN+Raw_Count": {0: 80.0, 1: 82.0}})
        tables = build_results_tables(reg)
        assert "DQN+Raw\\_Count" in tables[0].latex_code

    def test_minimize_direction(self):
        """Under 'minimize', the lowest-metric condition is bolded."""
        reg = _make_registry(
            {
                "Baseline": {0: 20.0, 1: 22.0},
                "Proposed": {0: 10.0, 1: 12.0},
            }
        )
        tables = build_results_tables(reg, metric_direction="minimize")
        # Proposed (lower) should be bold
        lines = tables[0].latex_code.split("\n")
        proposed_line = [l for l in lines if "Proposed" in l][0]
        assert "\\textbf" in proposed_line
class TestConditionWhitelist:
    """build_condition_whitelist lists each condition with its seed count."""

    def test_basic(self):
        registry = _make_registry(
            {
                "DQN": {0: 206.1, 1: 105.5, 2: 356.7},
                "DQN+Abstraction": {0: 278.93},
            }
        )
        whitelist = build_condition_whitelist(registry)
        for expected in ("DQN", "DQN+Abstraction", "3 seed(s)", "1 seed(s)"):
            assert expected in whitelist

    def test_empty_registry(self):
        whitelist = build_condition_whitelist(VerifiedRegistry())
        assert "no conditions completed" in whitelist
class TestRealArtifacts:
    """Table building from recorded run artifacts; skips when data is absent."""

    def _load(self, run_id: str) -> VerifiedRegistry:
        """Load a registry from artifacts/rc-*-<run_id>, skipping when missing."""
        pattern = f"rc-*-{run_id}"
        matches = sorted(ARTIFACTS.glob(pattern))
        if not matches:
            pytest.skip(f"Artifact {run_id} not found")
        summary_path = matches[0] / "stage-14" / "experiment_summary.json"
        ref_path = matches[0] / "stage-13" / "refinement_log.json"
        if not summary_path.exists():
            pytest.skip(f"No experiment_summary for {run_id}")
        summary = json.loads(summary_path.read_text())
        # The refinement log is optional; pass None when it was not recorded.
        ref_log = None
        if ref_path.exists():
            ref_log = json.loads(ref_path.read_text())
        return VerifiedRegistry.from_experiment(summary, ref_log)

    def test_run_e57360_rl_tables(self):
        reg = self._load("e57360")
        tables = build_results_tables(reg, metric_name="Return")
        assert len(tables) >= 1
        main = tables[0]
        # Should NOT contain PPO (never ran)
        assert "PPO" not in main.latex_code
        # Should contain DQN
        assert "DQN" in main.latex_code

    def test_run_acbdfa_tables(self):
        reg = self._load("acbdfa")
        tables = build_results_tables(reg, metric_name="Top-1 Accuracy (\\%)")
        assert len(tables) >= 1
================================================
FILE: tests/test_robotics_adapter.py
================================================
"""Tests for robotics & control domain adapter.
Covers adapter dispatch, prompt block generation, and integration
with the existing domain detection and profile system.
"""
from __future__ import annotations
import pytest
from researchclaw.domains.detector import (
get_profile,
_keyword_detect,
_profile_cache,
)
from researchclaw.domains.prompt_adapter import (
MLPromptAdapter,
GenericPromptAdapter,
get_adapter,
)
# ---------------------------------------------------------------------------
# Profile sanity
# ---------------------------------------------------------------------------
class TestRoboticsProfile:
    """Sanity checks on the robotics_control domain profile."""

    def setup_method(self):
        # Profiles are cached module-wide; clear so each test loads fresh.
        _profile_cache.clear()

    def test_profile_exists(self):
        profile = get_profile("robotics_control")
        assert profile is not None
        assert profile.domain_id == "robotics_control"

    def test_profile_fields(self):
        profile = get_profile("robotics_control")
        assert profile is not None
        assert profile.experiment_paradigm == "comparison"
        for library in ("gymnasium", "stable-baselines3"):
            assert library in profile.core_libraries
        assert profile.gpu_required is True

    def test_profile_baselines(self):
        profile = get_profile("robotics_control")
        assert profile is not None
        names = profile.standard_baselines
        assert any("PPO" in name for name in names)
        assert any("SAC" in name for name in names)
# ---------------------------------------------------------------------------
# Keyword detection
# ---------------------------------------------------------------------------
class TestRoboticsKeywordDetection:
    """Keyword detection routes robotics phrases to robotics_control."""

    def test_robot_keyword(self):
        assert _keyword_detect("robot manipulation task") == "robotics_control"

    def test_mujoco(self):
        assert _keyword_detect("locomotion in MuJoCo") == "robotics_control"

    def test_pybullet(self):
        assert _keyword_detect("grasping policy with PyBullet") == "robotics_control"
# ---------------------------------------------------------------------------
# Adapter dispatch
# ---------------------------------------------------------------------------
class TestRoboticsAdapter:
    """Adapter dispatch and prompt-block generation for robotics_control.

    The profile fetch + skip boilerplate was repeated in every test; it is
    factored into _profile_or_skip so each test states only its assertions.
    """

    @staticmethod
    def _profile_or_skip():
        """Return the robotics_control profile, skipping the test if absent."""
        profile = get_profile("robotics_control")
        if profile is None:
            pytest.skip("robotics_control profile not found")
        return profile

    def test_gets_robotics_adapter(self):
        adapter = get_adapter(self._profile_or_skip())
        assert not isinstance(adapter, MLPromptAdapter)
        # Before this contribution it would fall back to GenericPromptAdapter
        from researchclaw.domains.adapters.robotics import (
            RoboticsPromptAdapter,
        )
        assert isinstance(adapter, RoboticsPromptAdapter)

    def test_code_generation_blocks_nonempty(self):
        adapter = get_adapter(self._profile_or_skip())
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints
        assert blocks.dataset_guidance
        assert blocks.output_format_guidance

    def test_experiment_design_mentions_baselines(self):
        adapter = get_adapter(self._profile_or_skip())
        blocks = adapter.get_experiment_design_blocks({})
        assert "PPO" in blocks.experiment_design_context
        assert "SAC" in blocks.experiment_design_context

    def test_result_analysis_mentions_return(self):
        adapter = get_adapter(self._profile_or_skip())
        blocks = adapter.get_result_analysis_blocks({})
        assert "return" in blocks.result_analysis_hints.lower()

    def test_blueprint_context(self):
        # This test needs the profile itself (for typical_file_structure),
        # not just the adapter.
        profile = self._profile_or_skip()
        adapter = get_adapter(profile)
        ctx = adapter.get_blueprint_context()
        if profile.typical_file_structure:
            assert "agent.py" in ctx or "train.py" in ctx
================================================
FILE: tests/test_servers.py
================================================
"""Tests for multi-server resource scheduling (C2): Registry, Monitor, Dispatcher, Executors."""
from __future__ import annotations
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from researchclaw.servers.registry import ServerEntry, ServerRegistry
from researchclaw.servers.monitor import ServerMonitor, _parse_status_output
from researchclaw.servers.dispatcher import TaskDispatcher
from researchclaw.servers.ssh_executor import SSHExecutor
from researchclaw.servers.slurm_executor import SlurmExecutor
from researchclaw.servers.cloud_executor import CloudExecutor
# ── fixtures ──────────────────────────────────────────────────────
def _make_server(
    name: str = "s1",
    host: str = "gpu1.local",
    server_type: str = "ssh",
    vram_gb: int = 24,
    priority: int = 1,
    cost: float = 0.0,
    scheduler: str = "",
    cloud_provider: str = "",
) -> ServerEntry:
    """Build a ServerEntry with test-friendly defaults (RTX 4090 over SSH)."""
    fields = dict(
        name=name,
        host=host,
        server_type=server_type,
        gpu="RTX 4090",
        vram_gb=vram_gb,
        priority=priority,
        cost_per_hour=cost,
        scheduler=scheduler,
        cloud_provider=cloud_provider,
    )
    return ServerEntry(**fields)
@pytest.fixture
def registry() -> ServerRegistry:
    """Three-server registry: free local SSH, paid AWS cloud, and a Slurm HPC."""
    servers = [
        _make_server("local", "localhost", vram_gb=48, priority=1),
        _make_server("cloud1", "cloud.host", server_type="cloud", vram_gb=80, priority=3, cost=2.0, cloud_provider="aws"),
        _make_server("hpc", "hpc.host", server_type="slurm", vram_gb=40, priority=2, scheduler="slurm"),
    ]
    return ServerRegistry(servers)
# ══════════════════════════════════════════════════════════════════
# ServerEntry tests
# ══════════════════════════════════════════════════════════════════
class TestServerEntry:
    """Serialization round-trip and default handling for ServerEntry."""

    def test_to_dict_roundtrip(self) -> None:
        original = _make_server()
        restored = ServerEntry.from_dict(original.to_dict())
        assert restored.name == original.name
        assert restored.vram_gb == original.vram_gb

    def test_defaults(self) -> None:
        entry = ServerEntry.from_dict({"name": "x"})
        assert entry.server_type == "ssh"
        assert entry.priority == 1
# ══════════════════════════════════════════════════════════════════
# ServerRegistry tests
# ══════════════════════════════════════════════════════════════════
class TestServerRegistry:
    """Registry CRUD, priority ordering, and best-match selection."""

    def test_list_all_sorted_by_priority(self, registry: ServerRegistry) -> None:
        """list_all returns servers in ascending priority order."""
        servers = registry.list_all()
        priorities = [s.priority for s in servers]
        assert priorities == sorted(priorities)

    def test_count(self, registry: ServerRegistry) -> None:
        assert registry.count == 3

    def test_add_server(self) -> None:
        reg = ServerRegistry()
        reg.add(_make_server("new"))
        assert reg.count == 1
        assert reg.get("new").name == "new"

    def test_remove_server(self, registry: ServerRegistry) -> None:
        registry.remove("local")
        assert registry.count == 2

    def test_remove_unknown_raises(self, registry: ServerRegistry) -> None:
        """Removing a name not in the registry raises KeyError."""
        with pytest.raises(KeyError):
            registry.remove("ghost")

    def test_get_unknown_raises(self, registry: ServerRegistry) -> None:
        with pytest.raises(KeyError):
            registry.get("ghost")

    def test_get_available_excludes(self, registry: ServerRegistry) -> None:
        """Servers named in 'exclude' are filtered out of availability."""
        avail = registry.get_available(exclude={"local"})
        names = [s.name for s in avail]
        assert "local" not in names
        assert len(names) == 2

    def test_get_best_match_by_vram(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match({"min_vram_gb": 40})
        assert best is not None
        assert best.vram_gb >= 40

    def test_get_best_match_by_type(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match({"server_type": "slurm"})
        assert best is not None
        assert best.server_type == "slurm"

    def test_get_best_match_prefers_free(self, registry: ServerRegistry) -> None:
        """prefer_free biases selection toward zero-cost servers."""
        best = registry.get_best_match(prefer_free=True)
        assert best is not None
        assert best.cost_per_hour == 0.0

    def test_get_best_match_none_when_impossible(self, registry: ServerRegistry) -> None:
        """Unsatisfiable requirements yield None rather than raising."""
        best = registry.get_best_match({"min_vram_gb": 999})
        assert best is None

    def test_get_best_match_by_gpu(self, registry: ServerRegistry) -> None:
        # "RTX" matches the fixture GPU name "RTX 4090" on every server.
        best = registry.get_best_match({"gpu": "RTX"})
        assert best is not None

    def test_get_best_match_no_requirements(self, registry: ServerRegistry) -> None:
        """With no constraints, the priority-1 fixture server 'local' wins."""
        best = registry.get_best_match()
        assert best is not None
        assert best.name == "local"
# ══════════════════════════════════════════════════════════════════
# ServerMonitor tests
# ══════════════════════════════════════════════════════════════════
class TestServerMonitor:
    """Status-output parsing and (mocked) SSH reachability checks."""

    def test_parse_status_output(self) -> None:
        """Parses the '---'-separated sections: GPU stats CSV, memory, uptime."""
        raw = "75, 8000, 24576\n---\n total used free\nMem: 64000 32000 32000\n---\n 10:00:00 up 5 days"
        server = _make_server()
        status = _parse_status_output(raw, server)
        assert status["gpu"]["count"] == 1
        assert status["gpu"]["devices"][0]["utilization_pct"] == 75
        assert status["memory"]["total_mb"] == 64000
        assert "uptime" in status

    def test_parse_status_no_gpu(self) -> None:
        """An empty GPU section parses to a zero GPU count."""
        raw = "\n---\n total used free\nMem: 64000 32000 32000\n---\nup 1 day"
        server = _make_server()
        status = _parse_status_output(raw, server)
        assert status["gpu"]["count"] == 0

    def test_get_cached_none(self, registry: ServerRegistry) -> None:
        """No cached status exists before any check has run."""
        monitor = ServerMonitor(registry)
        assert monitor.get_cached("local") is None

    def test_get_gpu_usage_empty(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        assert monitor.get_gpu_usage(_make_server()) == {}

    def test_check_status_unreachable(self, registry: ServerRegistry) -> None:
        """An SSH failure marks the server unreachable instead of raising."""
        monitor = ServerMonitor(registry)
        with patch("researchclaw.servers.monitor._ssh_command", side_effect=RuntimeError("unreachable")):
            status = asyncio.run(monitor.check_status(_make_server()))
            assert status["reachable"] is False

    def test_check_all(self, registry: ServerRegistry) -> None:
        """check_all reports every registered server; all unreachable here."""
        monitor = ServerMonitor(registry)
        with patch("researchclaw.servers.monitor._ssh_command", side_effect=RuntimeError("unreachable")):
            results = asyncio.run(monitor.check_all())
            assert len(results) == 3
            for name, status in results.items():
                assert status["reachable"] is False
# ══════════════════════════════════════════════════════════════════
# SSHExecutor tests
# ══════════════════════════════════════════════════════════════════
class TestSSHExecutor:
    """SSHExecutor construction and timeout handling (subprocess mocked)."""

    def test_init(self) -> None:
        server = _make_server()
        exe = SSHExecutor(server)
        assert exe.host == "gpu1.local"

    def test_run_experiment_timeout(self) -> None:
        """A hung remote command surfaces as success=False with a Timeout error."""
        server = _make_server()
        exe = SSHExecutor(server)

        async def _run() -> dict:
            # Mock subprocess whose communicate() times out; kill/wait mocked
            # so the executor's cleanup path does not touch a real process.
            with patch("asyncio.create_subprocess_exec") as mock_exec:
                proc = AsyncMock()
                proc.communicate = AsyncMock(side_effect=asyncio.TimeoutError)
                proc.kill = AsyncMock()
                proc.wait = AsyncMock()
                mock_exec.return_value = proc
                return await exe.run_experiment("/tmp/test", "echo hello", timeout=1)

        result = asyncio.run(_run())
        assert result["success"] is False
        assert "Timeout" in result["error"]
# ══════════════════════════════════════════════════════════════════
# SlurmExecutor tests
# ══════════════════════════════════════════════════════════════════
class TestSlurmExecutor:
    """sbatch script generation and job submission (subprocess mocked)."""

    def test_init_wrong_type_raises(self) -> None:
        """Constructing with a non-slurm server raises ValueError."""
        server = _make_server(server_type="ssh")
        with pytest.raises(ValueError, match="not a slurm"):
            SlurmExecutor(server)

    def test_generate_sbatch_script(self) -> None:
        """Explicit resources map to #SBATCH gres/mem directives."""
        server = _make_server(server_type="slurm", scheduler="slurm")
        exe = SlurmExecutor(server)
        script = exe._generate_sbatch_script("python main.py", resources={"gpus": 2, "mem_gb": 32})
        assert "#SBATCH --gres=gpu:2" in script
        assert "#SBATCH --mem=32G" in script
        assert "python main.py" in script

    def test_sbatch_script_default_resources(self) -> None:
        """Defaults: one GPU and a one-hour time limit."""
        server = _make_server(server_type="slurm", scheduler="slurm")
        exe = SlurmExecutor(server)
        script = exe._generate_sbatch_script("echo hi")
        assert "#SBATCH --gres=gpu:1" in script
        assert "#SBATCH --time=01:00:00" in script

    def test_submit_job_parses_output(self) -> None:
        """The job id is parsed from sbatch's 'Submitted batch job N' output."""
        server = _make_server(server_type="slurm", scheduler="slurm")
        exe = SlurmExecutor(server)

        async def _run() -> str:
            with patch("asyncio.create_subprocess_exec") as mock_exec:
                proc = AsyncMock()
                proc.communicate = AsyncMock(return_value=(b"Submitted batch job 12345\n", b""))
                proc.returncode = 0
                mock_exec.return_value = proc
                return await exe.submit_job("echo hi", "/tmp/test")

        job_id = asyncio.run(_run())
        assert job_id == "12345"
# ══════════════════════════════════════════════════════════════════
# CloudExecutor tests
# ══════════════════════════════════════════════════════════════════
class TestCloudExecutor:
    """CloudExecutor type validation and stubbed instance launch."""

    def test_init_wrong_type_raises(self) -> None:
        non_cloud = _make_server(server_type="ssh")
        with pytest.raises(ValueError, match="not a cloud"):
            CloudExecutor(non_cloud)

    def test_launch_instance_stub(self) -> None:
        executor = CloudExecutor(_make_server(server_type="cloud", cloud_provider="aws"))
        launched = asyncio.run(executor.launch_instance())
        assert launched["status"] == "stub_launched"
        assert launched["provider"] == "aws"
# ══════════════════════════════════════════════════════════════════
# TaskDispatcher tests
# ══════════════════════════════════════════════════════════════════
class TestTaskDispatcher:
    """Dispatch bookkeeping: task ids, queuing, and unknown-task status."""

    def test_dispatch_returns_task_id(self, registry: ServerRegistry) -> None:
        """Dispatching yields a 12-character task identifier."""
        monitor = ServerMonitor(registry)
        disp = TaskDispatcher(registry, monitor)
        task_id = asyncio.run(disp.dispatch({"command": "echo hi", "local_dir": "/tmp"}))
        assert len(task_id) == 12

    def test_dispatch_no_server_queues(self) -> None:
        """With an empty registry the task is queued instead of executed."""
        reg = ServerRegistry()
        monitor = ServerMonitor(reg)
        disp = TaskDispatcher(reg, monitor)
        task_id = asyncio.run(disp.dispatch({"command": "echo hi"}))
        status = disp.get_task_status(task_id)
        assert status["status"] == "queued"

    def test_get_task_status_unknown(self, registry: ServerRegistry) -> None:
        """Unknown task ids report status 'unknown'."""
        monitor = ServerMonitor(registry)
        disp = TaskDispatcher(registry, monitor)
        status = disp.get_task_status("nonexistent")
        assert status["status"] == "unknown"
================================================
FILE: tests/test_skills_library.py
================================================
"""Tests for the dynamic skills library.
Covers:
- Skill schema (agentskills.io data model)
- YAML skill loading (legacy)
- SKILL.md loading (agentskills.io)
- Skill registry (register, query, external dirs)
- Keyword matching + description fallback
- Stage filtering (int + string)
- Prompt formatting
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from researchclaw.skills.schema import Skill, STAGE_NAME_TO_NUMBER
from researchclaw.skills.loader import (
load_skill_file,
load_skill_from_skillmd,
load_skillmd_from_directory,
load_skills_from_directory,
)
from researchclaw.skills.registry import SkillRegistry
from researchclaw.skills.matcher import (
match_skills,
format_skills_for_prompt,
_tokenize,
_resolve_stage,
)
# ── Fixtures ─────────────────────────────────────────────────────────
@pytest.fixture
def sample_skill() -> Skill:
    """A fully-populated Skill using agentskills.io string-valued metadata."""
    return Skill(
        name="test-skill-1",
        description="A test skill for unit testing",
        body="## Test Skill\nDo the thing.",
        # Metadata values are strings (as parsed from SKILL.md frontmatter);
        # the schema is expected to coerce them to typed fields.
        metadata={
            "category": "tooling",
            "trigger-keywords": "training,pytorch,gpu",
            "applicable-stages": "10,12",
            "priority": "5",
            "version": "1.0",
            "code-template": "print('hello')",
            "references": "Test Paper 2024",
        },
    )
@pytest.fixture
def skill_yaml_dir(tmp_path: Path) -> Path:
    """Temp directory holding one legacy YAML-format skill file."""
    d = tmp_path / "skills"
    d.mkdir()
    skill_data = {
        "id": "yaml-skill-1",
        "name": "YAML Test Skill",
        "category": "experiment",
        "description": "Loaded from YAML",
        "trigger_keywords": ["review", "literature"],
        "applicable_stages": [3, 4, 5],
        "prompt_template": "Do literature review",
        "version": "1.0",
        "priority": 3,
    }
    import yaml  # local import: only this fixture needs PyYAML
    (d / "test_skill.yaml").write_text(yaml.dump(skill_data), encoding="utf-8")
    return d
@pytest.fixture
def skill_json_dir(tmp_path: Path) -> Path:
    """Temp directory holding one JSON-format skill file."""
    skills_dir = tmp_path / "json_skills"
    skills_dir.mkdir()
    payload = {
        "id": "json-skill-1",
        "name": "JSON Test Skill",
        "category": "writing",
        "description": "Loaded from JSON",
        "trigger_keywords": ["paper", "writing"],
        "applicable_stages": [17],
        "prompt_template": "Write well",
        "version": "1.0",
        "priority": 4,
    }
    (skills_dir / "test_skill.json").write_text(json.dumps(payload), encoding="utf-8")
    return skills_dir
@pytest.fixture
def skillmd_dir(tmp_path: Path) -> Path:
    """Create a directory with SKILL.md files for testing."""
    d = tmp_path / "skillmd_skills"
    d.mkdir()
    # Skill with full metadata (frontmatter carries a nested 'metadata' map)
    s1 = d / "test-skill-md"
    s1.mkdir()
    (s1 / "SKILL.md").write_text(
        "---\n"
        "name: test-skill-md\n"
        "description: A test skill from SKILL.md\n"
        "metadata:\n"
        " category: domain\n"
        " trigger-keywords: \"nlp,transformer,bert\"\n"
        " applicable-stages: \"9,10\"\n"
        " priority: \"2\"\n"
        "---\n\n"
        "## NLP Skill\nDo NLP things.\n",
        encoding="utf-8",
    )
    # Skill with minimal metadata (no trigger-keywords)
    s2 = d / "minimal-skill"
    s2.mkdir()
    (s2 / "SKILL.md").write_text(
        "---\n"
        "name: minimal-skill\n"
        "description: A minimal skill for testing description-based matching\n"
        "---\n\n"
        "## Minimal\nJust a body.\n",
        encoding="utf-8",
    )
    return d
@pytest.fixture
def external_skillmd_dir(tmp_path: Path) -> Path:
    """Simulates an external skill directory (like Collider-Agent)."""
    d = tmp_path / "external"
    d.mkdir()
    s = d / "hep-feynrules"
    s.mkdir()
    # Single domain skill pinned to stage 10, no trigger keywords.
    (s / "SKILL.md").write_text(
        "---\n"
        "name: hep-feynrules\n"
        "description: Generate FeynRules model files for BSM physics\n"
        "metadata:\n"
        " category: domain\n"
        " applicable-stages: \"10\"\n"
        "---\n\n"
        "## FeynRules Model Generation\n"
        "Build BSM model files for MadGraph.\n",
        encoding="utf-8",
    )
    return d
# ── Skill Schema ─────────────────────────────────────────────────────
class TestSkillSchema:
    """Skill data model: construction, dict round-trips, stage-name mapping."""

    def test_create_skill(self, sample_skill: Skill) -> None:
        assert sample_skill.name == "test-skill-1"
        assert sample_skill.id == "test-skill-1"  # backward compat
        assert sample_skill.category == "tooling"
        assert len(sample_skill.trigger_keywords) == 3

    def test_to_dict(self, sample_skill: Skill) -> None:
        """String metadata is surfaced as typed fields in to_dict output."""
        d = sample_skill.to_dict()
        assert d["id"] == "test-skill-1"
        assert d["applicable_stages"] == [10, 12]
        assert d["code_template"] == "print('hello')"

    def test_from_dict(self) -> None:
        data = {
            "id": "from-dict",
            "name": "From Dict",
            "category": "domain",
            "description": "Created from dict",
            "trigger_keywords": ["test"],
            "applicable_stages": [1],
            "prompt_template": "test prompt",
        }
        skill = Skill.from_dict(data)
        # NOTE(review): asserts the "id" value, not "name" ("From Dict") —
        # presumably from_dict derives name from id; confirm that is intended.
        assert skill.name == "from-dict"
        assert skill.priority == 5  # default

    def test_from_dict_defaults(self) -> None:
        """An empty dict yields a Skill with documented defaults."""
        skill = Skill.from_dict({})
        assert skill.name == ""
        assert skill.version == "1.0"
        assert skill.code_template is None

    def test_roundtrip(self, sample_skill: Skill) -> None:
        d = sample_skill.to_dict()
        restored = Skill.from_dict(d)
        assert restored.name == sample_skill.name
        assert restored.applicable_stages == sample_skill.applicable_stages

    def test_stage_name_to_number(self) -> None:
        assert STAGE_NAME_TO_NUMBER["code_generation"] == 10
        assert STAGE_NAME_TO_NUMBER["paper_draft"] == 17
        assert len(STAGE_NAME_TO_NUMBER) == 23

    def test_prompt_template_alias(self, sample_skill: Skill) -> None:
        """prompt_template is an alias for the skill body."""
        assert sample_skill.prompt_template == sample_skill.body
# ── Skill Loader ─────────────────────────────────────────────────────
class TestSkillLoader:
    """load_skill_file / load_skills_from_directory for YAML and JSON inputs."""

    def test_load_yaml(self, skill_yaml_dir: Path) -> None:
        skill = load_skill_file(skill_yaml_dir / "test_skill.yaml")
        assert skill is not None
        assert skill.name == "yaml-skill-1"
        assert skill.category == "experiment"

    def test_load_json(self, skill_json_dir: Path) -> None:
        skill = load_skill_file(skill_json_dir / "test_skill.json")
        assert skill is not None
        assert skill.name == "json-skill-1"

    def test_load_nonexistent(self, tmp_path: Path) -> None:
        """Missing files return None rather than raising."""
        skill = load_skill_file(tmp_path / "nope.yaml")
        assert skill is None

    def test_load_invalid_yaml(self, tmp_path: Path) -> None:
        """Malformed YAML content is reported as None."""
        bad = tmp_path / "bad.yaml"
        bad.write_text("not: [valid: yaml: {", encoding="utf-8")
        skill = load_skill_file(bad)
        assert skill is None

    def test_load_unsupported_format(self, tmp_path: Path) -> None:
        """An unsupported extension (.txt) yields None."""
        txt = tmp_path / "skill.txt"
        txt.write_text("id: test", encoding="utf-8")
        skill = load_skill_file(txt)
        assert skill is None

    def test_load_directory(self, skill_yaml_dir: Path) -> None:
        skills = load_skills_from_directory(skill_yaml_dir)
        assert len(skills) == 1

    def test_load_empty_directory(self, tmp_path: Path) -> None:
        empty = tmp_path / "empty"
        empty.mkdir()
        skills = load_skills_from_directory(empty)
        assert skills == []

    def test_load_missing_directory(self, tmp_path: Path) -> None:
        """A nonexistent directory loads as an empty list."""
        skills = load_skills_from_directory(tmp_path / "nonexistent")
        assert skills == []
class TestSkillMdLoader:
    """Loading skills from SKILL.md files (YAML frontmatter + markdown body)."""

    def test_load_skillmd(self, skillmd_dir: Path) -> None:
        """A fully-populated SKILL.md maps frontmatter fields onto the Skill."""
        skill = load_skill_from_skillmd(skillmd_dir / "test-skill-md" / "SKILL.md")
        assert skill is not None
        assert skill.name == "test-skill-md"
        assert skill.category == "domain"
        assert "nlp" in skill.trigger_keywords
        assert skill.applicable_stages == [9, 10]
        assert skill.priority == 2
        assert "NLP Skill" in skill.body
        assert skill.source_format == "skillmd"

    def test_load_skillmd_minimal(self, skillmd_dir: Path) -> None:
        """Optional frontmatter fields fall back to empty/default values."""
        skill = load_skill_from_skillmd(skillmd_dir / "minimal-skill" / "SKILL.md")
        assert skill is not None
        assert skill.name == "minimal-skill"
        assert skill.trigger_keywords == []
        assert skill.applicable_stages == []
        assert skill.priority == 5  # default

    def test_load_skillmd_missing(self, tmp_path: Path) -> None:
        """A missing SKILL.md yields None rather than raising."""
        skill = load_skill_from_skillmd(tmp_path / "nope" / "SKILL.md")
        assert skill is None

    def test_load_skillmd_no_frontmatter(self, tmp_path: Path) -> None:
        """A SKILL.md without YAML frontmatter is rejected (returns None)."""
        d = tmp_path / "bad-skill"
        d.mkdir()
        (d / "SKILL.md").write_text("No frontmatter here", encoding="utf-8")
        skill = load_skill_from_skillmd(d / "SKILL.md")
        assert skill is None

    def test_load_skillmd_directory(self, skillmd_dir: Path) -> None:
        """Directory scan discovers every SKILL.md-based skill."""
        skills = load_skillmd_from_directory(skillmd_dir)
        assert len(skills) == 2
        names = {s.name for s in skills}
        assert "test-skill-md" in names
        assert "minimal-skill" in names

    def test_skillmd_wins_over_yaml(self, tmp_path: Path) -> None:
        """When both SKILL.md and YAML exist for the same name, SKILL.md wins."""
        d = tmp_path / "mixed"
        d.mkdir()
        # YAML file
        import yaml
        (d / "test-skill-md.yaml").write_text(
            yaml.dump({
                "id": "test-skill-md",
                "name": "test-skill-md",
                "category": "tooling",
                "description": "From YAML",
                "trigger_keywords": ["x"],
                "applicable_stages": [1],
                "prompt_template": "yaml body",
            }),
            encoding="utf-8",
        )
        # SKILL.md file
        sd = d / "test-skill-md"
        sd.mkdir()
        (sd / "SKILL.md").write_text(
            "---\nname: test-skill-md\ndescription: From SKILL.md\n---\n\nskillmd body\n",
            encoding="utf-8",
        )
        skills = load_skills_from_directory(d)
        matched = [s for s in skills if s.name == "test-skill-md"]
        assert len(matched) == 1
        assert matched[0].source_format == "skillmd"
        assert "From SKILL.md" in matched[0].description
# ── Matcher ──────────────────────────────────────────────────────────
class TestMatcher:
    """Keyword- and stage-based matching of skills against a context string."""

    def test_tokenize(self) -> None:
        """_tokenize lowercases and splits the context into word tokens."""
        tokens = _tokenize("PyTorch Training GPU")
        assert "pytorch" in tokens
        assert "training" in tokens
        assert "gpu" in tokens

    def test_match_by_keyword(self, sample_skill: Skill) -> None:
        """A context containing trigger keywords matches the skill."""
        matched = match_skills(
            [sample_skill],
            context="training a pytorch model on gpu",
            stage=10,
        )
        assert len(matched) == 1
        assert matched[0].name == "test-skill-1"

    def test_match_filters_by_stage(self, sample_skill: Skill) -> None:
        """Skills are excluded when the stage is not in applicable_stages."""
        matched = match_skills(
            [sample_skill],
            context="training pytorch gpu",
            stage=1,  # not in applicable_stages
        )
        assert len(matched) == 0

    def test_match_empty_context(self, sample_skill: Skill) -> None:
        """An empty context matches nothing."""
        matched = match_skills([sample_skill], context="", stage=10)
        assert len(matched) == 0

    def test_match_no_keyword_overlap(self, sample_skill: Skill) -> None:
        """A context sharing no keywords with the skill matches nothing."""
        matched = match_skills(
            [sample_skill],
            context="linguistics morphology",
            stage=10,
        )
        assert len(matched) == 0

    def test_match_respects_top_k(self) -> None:
        """At most top_k skills are returned even when more would match."""
        skills = [
            Skill(
                name=f"skill-{i}",
                description="test",
                body="test",
                metadata={
                    "category": "tooling",
                    "trigger-keywords": "training",
                    "applicable-stages": "10",
                    "priority": str(i),
                },
            )
            for i in range(10)
        ]
        matched = match_skills(skills, context="training", stage=10, top_k=3)
        assert len(matched) == 3

    def test_match_priority_ordering(self) -> None:
        """A skill with priority '1' sorts before one with priority '9'."""
        high = Skill(
            name="high", description="t", body="t",
            metadata={
                "trigger-keywords": "training",
                "applicable-stages": "10",
                "priority": "1",
            },
        )
        low = Skill(
            name="low", description="t", body="t",
            metadata={
                "trigger-keywords": "training",
                "applicable-stages": "10",
                "priority": "9",
            },
        )
        matched = match_skills([low, high], context="training", stage=10)
        assert matched[0].name == "high"

    def test_match_string_stage(self, sample_skill: Skill) -> None:
        """String stage names should be resolved via STAGE_NAME_TO_NUMBER."""
        matched = match_skills(
            [sample_skill],
            context="training pytorch gpu",
            stage="code_generation",  # resolves to 10
        )
        assert len(matched) == 1
        assert matched[0].name == "test-skill-1"

    def test_match_string_stage_mismatch(self, sample_skill: Skill) -> None:
        """A resolved string stage outside applicable_stages matches nothing."""
        matched = match_skills(
            [sample_skill],
            context="training pytorch gpu",
            stage="paper_draft",  # resolves to 17, not in [10, 12]
        )
        assert len(matched) == 0

    def test_resolve_stage(self) -> None:
        """_resolve_stage passes ints through, maps names, returns -1 for unknowns."""
        assert _resolve_stage(10) == 10
        assert _resolve_stage("code_generation") == 10
        assert _resolve_stage("unknown_stage") == -1

    def test_match_description_fallback(self) -> None:
        """Skills without trigger_keywords should match via description."""
        external_skill = Skill(
            name="ext-skill",
            description="Generate FeynRules model files for BSM physics",
            body="Do feynrules things.",
            metadata={"applicable-stages": "10"},
        )
        matched = match_skills(
            [external_skill],
            context="feynrules model generation",
            stage=10,
            fallback_matching=True,
        )
        assert len(matched) == 1
        assert matched[0].name == "ext-skill"

    def test_match_description_fallback_disabled(self) -> None:
        """With fallback_matching off, keyword-less skills never match."""
        external_skill = Skill(
            name="ext-skill",
            description="Generate FeynRules model files for BSM physics",
            body="Do feynrules things.",
            metadata={"applicable-stages": "10"},
        )
        matched = match_skills(
            [external_skill],
            context="feynrules model generation",
            stage=10,
            fallback_matching=False,
        )
        assert len(matched) == 0
class TestFormatSkills:
    """Rendering matched skills into prompt text."""

    def test_format_single_skill(self, sample_skill: Skill) -> None:
        rendered = format_skills_for_prompt([sample_skill])
        assert "test-skill-1" in rendered
        assert "tooling" in rendered

    def test_format_empty(self) -> None:
        """No skills render to an empty string."""
        assert format_skills_for_prompt([]) == ""

    def test_format_includes_code_template(self, sample_skill: Skill) -> None:
        rendered = format_skills_for_prompt([sample_skill])
        assert "print('hello')" in rendered

    def test_format_includes_references(self, sample_skill: Skill) -> None:
        rendered = format_skills_for_prompt([sample_skill])
        assert "Test Paper 2024" in rendered

    def test_format_respects_max_chars(self) -> None:
        """Output stays near max_chars even with far more skill text available."""
        many_skills = []
        for i in range(10):
            many_skills.append(
                Skill(
                    name=f"s{i}", description="t", body="x" * 500,
                    metadata={
                        "category": "tooling",
                        "trigger-keywords": "t",
                    },
                )
            )
        rendered = format_skills_for_prompt(many_skills, max_chars=1000)
        assert len(rendered) <= 1500  # some slack for headers
# ── Registry ─────────────────────────────────────────────────────────
class TestSkillRegistry:
    """Registry construction, registration, lookup, and matching."""

    def test_registry_loads_builtins(self) -> None:
        registry = SkillRegistry()
        assert registry.count() >= 12  # builtin skills (SKILL.md format)

    def test_builtin_skillmd_count(self) -> None:
        """All builtin skills should load from SKILL.md."""
        registry = SkillRegistry()
        assert registry.count() == 16

    def test_register_custom(self, sample_skill: Skill) -> None:
        """register() adds one skill on top of the builtins."""
        registry = SkillRegistry()
        initial = registry.count()
        registry.register(sample_skill)
        assert registry.count() == initial + 1

    def test_get_skill(self, sample_skill: Skill) -> None:
        registry = SkillRegistry()
        registry.register(sample_skill)
        got = registry.get("test-skill-1")
        assert got is not None
        assert got.name == "test-skill-1"

    def test_get_nonexistent(self) -> None:
        """get() returns None for unknown names."""
        registry = SkillRegistry()
        assert registry.get("nonexistent") is None

    def test_unregister(self, sample_skill: Skill) -> None:
        """unregister() returns truthy and removes the skill."""
        registry = SkillRegistry()
        registry.register(sample_skill)
        assert registry.unregister("test-skill-1")
        assert registry.get("test-skill-1") is None

    def test_unregister_nonexistent(self) -> None:
        """unregister() returns falsy for unknown names."""
        registry = SkillRegistry()
        assert not registry.unregister("nope")

    def test_list_by_category(self) -> None:
        registry = SkillRegistry()
        tooling = registry.list_by_category("tooling")
        assert len(tooling) > 0
        assert all(s.category == "tooling" for s in tooling)

    def test_list_by_stage(self) -> None:
        registry = SkillRegistry()
        stage_10 = registry.list_by_stage(10)
        assert len(stage_10) > 0

    def test_match(self) -> None:
        registry = SkillRegistry()
        matched = registry.match("pytorch training classification cifar", stage=10)
        assert len(matched) > 0

    def test_match_string_stage(self) -> None:
        """match() accepts a stage name string as well as a number."""
        registry = SkillRegistry()
        matched = registry.match(
            "pytorch training classification",
            stage="code_generation",
        )
        assert len(matched) > 0

    def test_export_for_prompt(self) -> None:
        registry = SkillRegistry()
        matched = registry.match("pytorch training", stage=10, top_k=2)
        text = registry.export_for_prompt(matched)
        assert len(text) > 0

    def test_custom_dir_loading(self, skill_yaml_dir: Path) -> None:
        """Skills from custom_dirs are merged into the registry."""
        registry = SkillRegistry(custom_dirs=[str(skill_yaml_dir)])
        skill = registry.get("yaml-skill-1")
        assert skill is not None

    def test_registry_external_dirs(self, external_skillmd_dir: Path) -> None:
        """External SKILL.md dirs add skills on top of the 16 builtins."""
        registry = SkillRegistry(external_dirs=[str(external_skillmd_dir)])
        assert registry.count() == 17  # 16 builtin + 1 external
        skill = registry.get("hep-feynrules")
        assert skill is not None
        assert skill.category == "domain"

    def test_registry_external_match_fallback(
        self, external_skillmd_dir: Path
    ) -> None:
        """External skills without trigger_keywords should match via description."""
        registry = SkillRegistry(
            external_dirs=[str(external_skillmd_dir)],
            fallback_matching=True,
        )
        matched = registry.match("feynrules model generation", stage=10, top_k=10)
        names = [s.name for s in matched]
        assert "hep-feynrules" in names
================================================
FILE: tests/test_ssh_and_colab_sandbox.py
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false
"""Tests for ssh_remote and colab_drive experiment backends."""
from __future__ import annotations
import json
import textwrap
import time
from pathlib import Path
from unittest import mock
import pytest
from researchclaw.config import (
ColabDriveConfig,
ExperimentConfig,
SandboxConfig,
SshRemoteConfig,
DockerSandboxConfig,
CodeAgentConfig,
BenchmarkAgentConfig,
FigureAgentConfig,
)
from researchclaw.experiment.ssh_sandbox import (
SshRemoteSandbox,
_build_ssh_base,
_ssh_target,
)
from researchclaw.experiment.colab_sandbox import (
ColabDriveSandbox,
COLAB_WORKER_TEMPLATE,
)
from researchclaw.experiment.factory import create_sandbox
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_experiment_config(**overrides) -> ExperimentConfig:
    """Build an ExperimentConfig with default sub-configs, overridable per test."""
    kwargs = {
        "sandbox": SandboxConfig(),
        "docker": DockerSandboxConfig(),
        "ssh_remote": SshRemoteConfig(),
        "colab_drive": ColabDriveConfig(),
        "code_agent": CodeAgentConfig(),
        "benchmark_agent": BenchmarkAgentConfig(),
        "figure_agent": FigureAgentConfig(),
        **overrides,
    }
    return ExperimentConfig(**kwargs)
# ===========================================================================
# SSH Remote: unit tests
# ===========================================================================
class TestSshTarget:
    """_ssh_target formats the destination as user@host, user optional."""

    def test_with_user(self):
        cfg = SshRemoteConfig(host="gpu.lab.edu", user="alice")
        target = _ssh_target(cfg)
        assert target == "alice@gpu.lab.edu"

    def test_without_user(self):
        cfg = SshRemoteConfig(host="gpu.lab.edu")
        target = _ssh_target(cfg)
        assert target == "gpu.lab.edu"
class TestBuildSshBase:
    """_build_ssh_base assembles the base ssh argument vector."""

    def test_default_port(self):
        """No -p flag is emitted when the port is left at its default."""
        argv = _build_ssh_base(SshRemoteConfig(host="server", user="bob"))
        assert "ssh" in argv
        assert "bob@server" in argv
        assert "-p" not in argv

    def test_custom_port(self):
        """A custom port appears as the argument following -p."""
        argv = _build_ssh_base(SshRemoteConfig(host="server", user="bob", port=2222))
        flag_pos = argv.index("-p")
        assert argv[flag_pos + 1] == "2222"

    def test_key_path(self):
        """Configuring key_path adds an -i identity flag."""
        argv = _build_ssh_base(SshRemoteConfig(host="server", key_path="~/.ssh/my_key"))
        assert "-i" in argv
class TestSshRemoteSandboxCommands:
    """Remote command-string construction (bare and Docker variants)."""

    def test_bare_exec_cmd(self, tmp_path: Path):
        """Bare exec sets GPU pinning, HOME, python invocation, and net isolation."""
        cfg = SshRemoteConfig(
            host="server", user="test", gpu_ids=(0, 1),
            remote_python="python3",
        )
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_bare_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "CUDA_VISIBLE_DEVICES=0,1" in cmd
        assert "HOME=/tmp/rc-test" in cmd
        assert "python3 -u main.py" in cmd
        assert "unshare --net" in cmd

    def test_bare_exec_no_gpu(self, tmp_path: Path):
        """Without gpu_ids, no CUDA_VISIBLE_DEVICES is injected."""
        cfg = SshRemoteConfig(host="server", user="test")
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_bare_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "CUDA_VISIBLE_DEVICES" not in cmd

    def test_docker_exec_cmd(self, tmp_path: Path):
        """Docker exec wires image, mount, limits, network policy, and GPU device."""
        cfg = SshRemoteConfig(
            host="server", user="test",
            use_docker=True,
            docker_image="myimage:latest",
            docker_network_policy="none",
            docker_memory_limit_mb=4096,
            docker_shm_size_mb=1024,
            gpu_ids=(0,),
        )
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_docker_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "docker run --rm" in cmd
        assert "-v /tmp/rc-test:/workspace" in cmd
        assert "--network none" in cmd
        assert "--memory=4096m" in cmd
        assert "--shm-size=1024m" in cmd
        assert "device=0" in cmd
        assert "myimage:latest" in cmd
        assert cmd.endswith("main.py")

    def test_docker_exec_full_network(self, tmp_path: Path):
        """Policy 'full' omits the --network flag entirely."""
        cfg = SshRemoteConfig(
            host="server", use_docker=True,
            docker_network_policy="full",
        )
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_docker_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "--network" not in cmd
# ── Entry point path traversal validation ─────────────────────────────
class TestSshEntryPointValidation:
    """run_project() must sanitise entry_point before any remote execution."""

    def test_run_project_rejects_path_traversal(self, tmp_path: Path):
        """run_project() must reject entry_point with '..' components."""
        project = tmp_path / "proj"
        project.mkdir()
        (project / "main.py").write_text("print('hi')")
        cfg = SshRemoteConfig(host="server", user="test")
        work = tmp_path / "work"
        sandbox = SshRemoteSandbox(cfg, work)
        # Create escape target so .exists() alone wouldn't catch it
        work.mkdir(parents=True, exist_ok=True)
        (work / "escape.py").write_text("print('escaped!')")
        # Mock _execute to ensure it's never reached
        sandbox._execute = mock.MagicMock()  # type: ignore[assignment]
        result = sandbox.run_project(project, entry_point="../escape.py")
        assert result.returncode == -1
        assert ".." in result.stderr
        sandbox._execute.assert_not_called()

    def test_run_project_rejects_absolute_path(self, tmp_path: Path):
        """run_project() must reject absolute entry_point paths."""
        project = tmp_path / "proj"
        project.mkdir()
        (project / "main.py").write_text("print('hi')")
        cfg = SshRemoteConfig(host="server", user="test")
        sandbox = SshRemoteSandbox(cfg, tmp_path / "work")
        sandbox._execute = mock.MagicMock()  # type: ignore[assignment]
        result = sandbox.run_project(project, entry_point="/etc/passwd")
        assert result.returncode == -1
        # Accept either wording of the rejection message.
        assert "relative" in result.stderr.lower() or "absolute" in result.stderr.lower()
        sandbox._execute.assert_not_called()
class TestSshConnectivityCheck:
    """check_ssh_available rejects bad hosts without raising."""

    def test_empty_host(self):
        """An empty host fails fast with an explanatory message."""
        ok, msg = SshRemoteSandbox.check_ssh_available(SshRemoteConfig(host=""))
        assert not ok
        assert "empty" in msg

    def test_unreachable_host(self):
        """A host that cannot resolve reports failure."""
        unreachable = SshRemoteConfig(host="nonexistent-host-12345.invalid")
        ok, _msg = SshRemoteSandbox.check_ssh_available(unreachable)
        assert not ok
class TestSshSandboxRun:
    """Test run() with mocked SSH commands."""

    def test_run_success(self, tmp_path: Path):
        """A successful run parses 'key: value' metrics from exec stdout."""
        cfg = SshRemoteConfig(host="fake", user="test")
        sb = SshRemoteSandbox(cfg, tmp_path)
        # One fake result per expected SSH invocation, in call order.
        fake_results = [
            mock.Mock(returncode=0, stdout="", stderr=""),  # mkdir
            mock.Mock(returncode=0, stdout="accuracy: 0.95\nloss: 0.05", stderr=""),  # exec
            mock.Mock(returncode=0, stdout="", stderr=""),  # cleanup
        ]
        call_count = [0]

        def fake_ssh_run(command, *, timeout_sec=60):
            from researchclaw.experiment.ssh_sandbox import _SshResult
            # Clamp to the last fake result if called more times than expected.
            idx = min(call_count[0], len(fake_results) - 1)
            r = fake_results[idx]
            call_count[0] += 1
            return _SshResult(
                returncode=r.returncode,
                stdout=r.stdout,
                stderr=r.stderr,
            )

        def fake_scp(local_dir, remote_dir):
            return True

        with mock.patch.object(sb, '_ssh_run', side_effect=fake_ssh_run):
            with mock.patch.object(sb, '_scp_upload', side_effect=fake_scp):
                result = sb.run("print('hello')", timeout_sec=60)
        assert result.returncode == 0
        assert result.metrics.get("accuracy") == 0.95
        assert result.metrics.get("loss") == 0.05

    def test_run_upload_failure(self, tmp_path: Path):
        """A failed scp upload aborts the run with returncode -1."""
        cfg = SshRemoteConfig(host="fake", user="test")
        sb = SshRemoteSandbox(cfg, tmp_path)
        from researchclaw.experiment.ssh_sandbox import _SshResult
        with mock.patch.object(sb, '_ssh_run', return_value=_SshResult(0, "", "")):
            with mock.patch.object(sb, '_scp_upload', return_value=False):
                result = sb.run("print('hello')")
        assert result.returncode == -1
        assert "Failed to upload" in result.stderr
# ===========================================================================
# Colab Drive: unit tests
# ===========================================================================
class TestColabDriveCheck:
    """check_drive_available validates the configured Drive root path."""

    def test_empty_root(self):
        ok, msg = ColabDriveSandbox.check_drive_available(
            ColabDriveConfig(drive_root="")
        )
        assert not ok
        assert "empty" in msg

    def test_nonexistent_root(self):
        ok, msg = ColabDriveSandbox.check_drive_available(
            ColabDriveConfig(drive_root="/nonexistent/path/12345")
        )
        assert not ok
        assert "not found" in msg

    def test_existing_root(self, tmp_path: Path):
        ok, msg = ColabDriveSandbox.check_drive_available(
            ColabDriveConfig(drive_root=str(tmp_path))
        )
        assert ok
class TestColabDriveSandbox:
    """End-to-end behaviour of the Drive-polling Colab sandbox."""

    def test_submit_and_collect(self, tmp_path: Path):
        """Simulate the full flow: submit task → worker picks up → collect result."""
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = ColabDriveConfig(
            drive_root=str(drive_root),
            poll_interval_sec=1,
            timeout_sec=10,
        )
        sb = ColabDriveSandbox(cfg, tmp_path / "workdir")
        # Simulate worker in a thread: move pending → done with result
        import threading

        def fake_worker():
            pending = drive_root / "pending"
            done = drive_root / "done"
            for _ in range(20):  # poll for up to ~10 seconds (20 x 0.5 s)
                if pending.exists():
                    for task_dir in pending.iterdir():
                        if task_dir.is_dir():
                            done.mkdir(parents=True, exist_ok=True)
                            done_dir = done / task_dir.name
                            task_dir.rename(done_dir)
                            (done_dir / "result.json").write_text(json.dumps({
                                "returncode": 0,
                                "stdout": "primary_metric: 42.0\naccuracy: 0.99",
                                "stderr": "",
                            }))
                            return
                time.sleep(0.5)

        worker = threading.Thread(target=fake_worker, daemon=True)
        worker.start()
        result = sb.run("print('experiment')", timeout_sec=15)
        worker.join(timeout=5)
        assert result.returncode == 0
        assert result.metrics.get("primary_metric") == 42.0
        assert result.metrics.get("accuracy") == 0.99

    def test_timeout(self, tmp_path: Path):
        """If worker never picks up, should timeout."""
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = ColabDriveConfig(
            drive_root=str(drive_root),
            poll_interval_sec=1,
            timeout_sec=3,
        )
        sb = ColabDriveSandbox(cfg, tmp_path / "workdir")
        result = sb.run("print('hello')", timeout_sec=3)
        assert result.timed_out
        assert result.returncode == -1
        assert "did not complete" in result.stderr

    def test_setup_script_written(self, tmp_path: Path):
        """_write_setup_script materialises the configured setup commands."""
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = ColabDriveConfig(
            drive_root=str(drive_root),
            poll_interval_sec=1,
            timeout_sec=3,
            setup_script="pip install torch -q",
        )
        sb = ColabDriveSandbox(cfg, tmp_path / "workdir")
        # Just submit, don't wait for result
        staging = tmp_path / "workdir" / "_colab_1"
        staging.mkdir(parents=True, exist_ok=True)
        (staging / "main.py").write_text("print('hi')")
        sb._write_setup_script(staging)
        setup_sh = staging / "setup.sh"
        assert setup_sh.exists()
        content = setup_sh.read_text()
        assert "pip install torch -q" in content
class TestColabWorkerTemplate:
    """Sanity checks on the embedded Colab worker notebook template."""

    def test_template_not_empty(self):
        assert len(COLAB_WORKER_TEMPLATE) > 100

    def test_template_has_key_elements(self):
        """The template references the queue dirs, result file, and Drive mount."""
        for marker in ("pending", "done", "result.json", "drive.mount"):
            assert marker in COLAB_WORKER_TEMPLATE
# ===========================================================================
# Factory integration tests
# ===========================================================================
class TestFactoryIntegration:
    """create_sandbox validates configuration before constructing a backend."""

    def test_ssh_remote_requires_host(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="ssh_remote",
            ssh_remote=SshRemoteConfig(host=""),
        )
        with pytest.raises(RuntimeError, match="host"):
            create_sandbox(cfg, tmp_path)

    def test_ssh_remote_checks_connectivity(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="ssh_remote",
            ssh_remote=SshRemoteConfig(host="nonexistent.invalid"),
        )
        with pytest.raises(RuntimeError, match="SSH connectivity"):
            create_sandbox(cfg, tmp_path)

    def test_colab_drive_requires_root(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="colab_drive",
            colab_drive=ColabDriveConfig(drive_root=""),
        )
        with pytest.raises(RuntimeError, match="empty"):
            create_sandbox(cfg, tmp_path)

    def test_colab_drive_checks_path(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="colab_drive",
            colab_drive=ColabDriveConfig(drive_root="/nonexistent/12345"),
        )
        with pytest.raises(RuntimeError, match="not found"):
            create_sandbox(cfg, tmp_path)

    def test_colab_drive_creates_sandbox(self, tmp_path: Path):
        """A valid Drive root produces a ColabDriveSandbox instance."""
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = _make_experiment_config(
            mode="colab_drive",
            colab_drive=ColabDriveConfig(drive_root=str(drive_root)),
        )
        sandbox = create_sandbox(cfg, tmp_path / "workdir")
        assert isinstance(sandbox, ColabDriveSandbox)
# ===========================================================================
# ACP timeout fix test
# ===========================================================================
class TestAcpTimeoutFix:
    """ACPClient.from_rc_config must pick up the configured ACP timeout."""

    def test_timeout_passed_from_config(self):
        """An AcpConfig timeout_sec flows through to ACPClient.config."""
        # Fixed: drop unused imports (RCConfig, ACPConfig) from the local imports.
        from researchclaw.config import AcpConfig, LlmConfig
        from researchclaw.llm.acp_client import ACPClient

        acp_cfg = AcpConfig(agent="codex", timeout_sec=1500)
        llm_cfg = LlmConfig(provider="acp", acp=acp_cfg)
        # Simulate RCConfig with just the fields ACPClient.from_rc_config uses
        fake_rc = mock.Mock()
        fake_rc.llm = llm_cfg
        client = ACPClient.from_rc_config(fake_rc)
        assert client.config.timeout_sec == 1500

    def test_timeout_default(self):
        """A fully mocked attribute chain with timeout_sec=600 is honoured."""
        from researchclaw.llm.acp_client import ACPClient

        fake_rc = mock.Mock()
        fake_rc.llm.acp.agent = "claude"
        fake_rc.llm.acp.cwd = "."
        fake_rc.llm.acp.acpx_command = ""
        fake_rc.llm.acp.session_name = "test"
        fake_rc.llm.acp.timeout_sec = 600
        client = ACPClient.from_rc_config(fake_rc)
        assert client.config.timeout_sec == 600
# ===========================================================================
# ACP session reconnect tests (Issue #52)
# ===========================================================================
class TestAcpSessionReconnect:
    """_send_prompt retry behaviour when the ACP session dies (Issue #52).

    Fixed: removed redundant method-local `import pytest` statements — pytest
    is already imported at module level in this file.
    """

    def test_reconnect_on_session_died(self):
        """_send_prompt retries when session dies with 'agent needs reconnect'."""
        from researchclaw.llm.acp_client import ACPClient, ACPConfig

        client = ACPClient(ACPConfig(agent="claude"))
        client._acpx = "/usr/bin/true"
        client._session_ready = True
        call_count = 0

        def fake_cli(acpx: str, prompt: str) -> str:
            # First call simulates a dead session; the retry succeeds.
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                raise RuntimeError("ACP prompt failed (exit 1): agent needs reconnect")
            return "success response"

        client._send_prompt_cli = fake_cli  # type: ignore[assignment]
        client._ensure_session = lambda: None  # type: ignore[assignment]
        client._force_reconnect = lambda: None  # type: ignore[assignment]
        result = client._send_prompt("test prompt")
        assert result == "success response"
        assert call_count == 2

    def test_reconnect_exhausted_raises(self):
        """_send_prompt raises after exhausting reconnect attempts."""
        from researchclaw.llm.acp_client import ACPClient, ACPConfig

        client = ACPClient(ACPConfig(agent="claude"))
        client._acpx = "/usr/bin/true"
        client._session_ready = True

        def always_fail(acpx: str, prompt: str) -> str:
            raise RuntimeError("ACP prompt failed (exit 1): session not found")

        client._send_prompt_cli = always_fail  # type: ignore[assignment]
        client._ensure_session = lambda: None  # type: ignore[assignment]
        client._force_reconnect = lambda: None  # type: ignore[assignment]
        with pytest.raises(RuntimeError, match="session not found"):
            client._send_prompt("test prompt")

    def test_non_reconnectable_error_raises_immediately(self):
        """_send_prompt does not retry on non-session errors."""
        from researchclaw.llm.acp_client import ACPClient, ACPConfig

        client = ACPClient(ACPConfig(agent="claude"))
        client._acpx = "/usr/bin/true"
        client._session_ready = True
        call_count = 0

        def fail_with_other_error(acpx: str, prompt: str) -> str:
            nonlocal call_count
            call_count += 1
            raise RuntimeError("ACP prompt failed (exit 1): permission denied")

        client._send_prompt_cli = fail_with_other_error  # type: ignore[assignment]
        client._ensure_session = lambda: None  # type: ignore[assignment]
        with pytest.raises(RuntimeError, match="permission denied"):
            client._send_prompt("test prompt")
        assert call_count == 1  # no retry
================================================
FILE: tests/test_trends.py
================================================
"""Tests for researchclaw.trends — Research Trend Tracker (Agent D1).
25+ tests covering feeds, trend_analyzer, opportunity_finder,
daily_digest, auto_topic, and literature/trends.
"""
from __future__ import annotations
import asyncio
from datetime import date
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.trends.feeds import FeedManager
from researchclaw.trends.trend_analyzer import TrendAnalyzer, _STOPWORDS
from researchclaw.trends.opportunity_finder import OpportunityFinder
from researchclaw.trends.daily_digest import DailyDigest
from researchclaw.trends.auto_topic import AutoTopicGenerator
from researchclaw.literature.trends import LiteratureTrendAnalyzer
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_papers(n: int = 10) -> list[dict[str, Any]]:
"""Generate synthetic papers for testing."""
papers = []
for i in range(n):
papers.append({
"title": f"Transformer attention mechanism for graph neural networks part {i}",
"authors": [
{"name": "Alice Smith"},
{"name": "Bob Jones"},
] if i % 2 == 0 else ["Alice Smith", "Charlie Brown"],
"abstract": (
"We propose a transformer-based attention approach for "
"graph neural networks using contrastive learning on ImageNet "
"and CIFAR datasets. Our diffusion model achieves SOTA results."
),
"url": f"https://arxiv.org/abs/2026.{i:05d}",
"source": "arxiv" if i % 2 == 0 else "semantic_scholar",
"published_date": "2026-03-01",
})
return papers
class MockLLM:
    """Stub LLM returning a fixed, parseable two-opportunity listing."""

    _RESPONSE = (
        "TOPIC: Graph transformers for drug discovery | "
        "WHY: Rising trend | FEASIBILITY: high\n"
        "TOPIC: Diffusion models for 3D generation | "
        "WHY: New paradigm | FEASIBILITY: medium\n"
    )

    async def chat_async(self, prompt: str) -> str:
        """Ignore the prompt and return the canned response."""
        return self._RESPONSE
class FailingLLM:
    """Stub LLM whose chat_async always raises, to exercise fallback paths."""

    async def chat_async(self, prompt: str) -> str:
        """Always fail, regardless of the prompt."""
        message = "API error"
        raise RuntimeError(message)
# ===================================================================
# FeedManager tests
# ===================================================================
class TestFeedManager:
    """Source filtering, deduplication, and failure handling in FeedManager."""

    def test_init_filters_supported_sources(self):
        """Unknown source names are dropped at construction time."""
        fm = FeedManager(sources=("arxiv", "invalid_source", "semantic_scholar"))
        assert fm.sources == ("arxiv", "semantic_scholar")

    def test_supported_sources(self):
        assert "arxiv" in FeedManager.SUPPORTED_SOURCES
        assert "semantic_scholar" in FeedManager.SUPPORTED_SOURCES
        assert "openalex" in FeedManager.SUPPORTED_SOURCES

    def test_fetch_deduplicates_by_title(self):
        """Papers with identical titles collapse to one result."""
        fm = FeedManager(sources=("arxiv",))
        # Mock _fetch_arxiv to return duplicates
        papers = [
            {"title": "Same Title", "source": "arxiv"},
            {"title": "Same Title", "source": "arxiv"},
            {"title": "Different Title", "source": "arxiv"},
        ]
        with patch.object(fm, "_fetch_arxiv", return_value=papers):
            result = fm.fetch_recent_papers(["ml"], max_papers=10)
        assert len(result) == 2

    def test_fetch_respects_max_papers(self):
        """The result list is capped at max_papers."""
        fm = FeedManager(sources=("arxiv",))
        papers = [{"title": f"Paper {i}", "source": "arxiv"} for i in range(20)]
        with patch.object(fm, "_fetch_arxiv", return_value=papers):
            result = fm.fetch_recent_papers(["ml"], max_papers=5)
        assert len(result) == 5

    def test_fetch_handles_source_failure(self):
        """A raising source is swallowed and yields an empty result."""
        fm = FeedManager(sources=("arxiv",))
        with patch.object(fm, "_fetch_arxiv", side_effect=RuntimeError("fail")):
            result = fm.fetch_recent_papers(["ml"])
        assert result == []

    def test_fetch_empty_title_excluded(self):
        """Papers with empty or whitespace-only titles are filtered out."""
        fm = FeedManager(sources=("arxiv",))
        papers = [
            {"title": "", "source": "arxiv"},
            {"title": " ", "source": "arxiv"},
            {"title": "Valid Paper", "source": "arxiv"},
        ]
        with patch.object(fm, "_fetch_arxiv", return_value=papers):
            result = fm.fetch_recent_papers(["ml"])
        assert len(result) == 1
# ===================================================================
# TrendAnalyzer tests
# ===================================================================
class TestTrendAnalyzer:
    """Keyword/author/dataset/method extraction and report generation."""

    def test_analyze_empty(self):
        """Analysing no papers yields zero counts and no keywords."""
        analyzer = TrendAnalyzer()
        result = analyzer.analyze([])
        assert result["paper_count"] == 0
        assert result["rising_keywords"] == []

    def test_analyze_extracts_keywords(self):
        analyzer = TrendAnalyzer()
        papers = _make_papers(10)
        result = analyzer.analyze(papers)
        assert result["paper_count"] == 10
        assert len(result["rising_keywords"]) > 0

    def test_keywords_exclude_stopwords(self):
        """No word in a rising keyword may be a stopword."""
        analyzer = TrendAnalyzer()
        papers = _make_papers(10)
        result = analyzer.analyze(papers)
        for kw in result["rising_keywords"]:
            for word in kw["keyword"].split():
                assert word not in _STOPWORDS

    def test_extract_authors_dict_format(self):
        """Authors given as {'name': ...} dicts are recognised."""
        analyzer = TrendAnalyzer()
        papers = [
            {"authors": [{"name": "Alice"}, {"name": "Bob"}]} for _ in range(5)
        ]
        authors = analyzer._extract_authors(papers)
        assert any(a["author"] == "Alice" for a in authors)

    def test_extract_authors_string_format(self):
        """Authors given as plain strings are recognised too."""
        analyzer = TrendAnalyzer()
        papers = [{"authors": ["Alice", "Bob"]} for _ in range(5)]
        authors = analyzer._extract_authors(papers)
        assert any(a["author"] == "Alice" for a in authors)

    def test_extract_datasets(self):
        """Dataset names are picked up from both titles and abstracts."""
        analyzer = TrendAnalyzer()
        papers = [
            {"title": "Training on ImageNet and CIFAR", "abstract": ""},
            {"title": "MNIST results", "abstract": "evaluated on GLUE benchmark"},
        ]
        datasets = analyzer._extract_datasets(papers)
        ds_names = {d["dataset"] for d in datasets}
        assert "ImageNet" in ds_names
        assert "CIFAR" in ds_names

    def test_extract_methods(self):
        """Method mentions are picked up from titles and abstracts."""
        analyzer = TrendAnalyzer()
        papers = [
            {"title": "Transformer attention", "abstract": "using diffusion models"},
            {"title": "GAN for images", "abstract": "contrastive learning approach"},
        ]
        methods = analyzer._extract_methods(papers)
        method_names = {m["method"] for m in methods}
        assert "transformer" in method_names or "attention" in method_names

    def test_tokenize(self):
        """_tokenize lowercases and keeps apostrophes/hyphens within tokens."""
        tokens = TrendAnalyzer._tokenize("Hello World! It's a test-case.")
        assert "hello" in tokens
        assert "world" in tokens
        assert "it's" in tokens
        assert "test-case" in tokens

    def test_source_distribution(self):
        """_source_distribution counts papers per source."""
        papers = [
            {"source": "arxiv"},
            {"source": "arxiv"},
            {"source": "semantic_scholar"},
        ]
        dist = TrendAnalyzer._source_distribution(papers)
        assert dist["arxiv"] == 2
        assert dist["semantic_scholar"] == 1

    def test_generate_trend_report(self):
        """The report includes a title line and the paper count."""
        analyzer = TrendAnalyzer()
        analysis = analyzer.analyze(_make_papers(10))
        report = analyzer.generate_trend_report(analysis)
        assert "Research Trend Analysis" in report
        assert "10 papers" in report

    def test_min_keyword_length(self):
        analyzer = TrendAnalyzer(min_keyword_length=5)
        papers = [{"title": "AI is a big deal", "abstract": ""}] * 5
        keywords = analyzer._extract_keywords(papers)
        # Short words like "deal" (4 chars) should be excluded by min_keyword_length=5
        # but "big" is only 3 chars so excluded too
        for kw in keywords:
            for word in kw["keyword"].split():
                assert len(word) >= 5 or word in _STOPWORDS
# ===================================================================
# OpportunityFinder tests
# ===================================================================
class TestOpportunityFinder:
    """Tests for OpportunityFinder's heuristic and LLM code paths."""

    def test_heuristic_no_llm(self):
        """Without an LLM client, opportunities come from the heuristic path."""
        trends = {
            "rising_keywords": [
                {"keyword": "graph neural", "count": 10},
                {"keyword": "attention", "count": 8},
                {"keyword": "diffusion", "count": 6},
            ],
            "method_trends": [
                {"method": "transformer", "mention_count": 12},
                {"method": "contrastive learning", "mention_count": 7},
            ],
        }
        opportunities = asyncio.run(
            OpportunityFinder().find_opportunities(trends, ["ml"])
        )
        assert len(opportunities) > 0
        for opp in opportunities:
            assert "topic" in opp
            assert opp["source"] == "heuristic"

    def test_heuristic_empty_trends(self):
        """No trend signals at all yields no opportunities."""
        empty = {"rising_keywords": [], "method_trends": []}
        assert asyncio.run(OpportunityFinder().find_opportunities(empty, ["ml"])) == []

    def test_llm_path(self):
        """With a working LLM client, results are tagged source='llm'."""
        trends = {
            "rising_keywords": [{"keyword": "graph", "count": 10}],
            "method_trends": [{"method": "transformer", "mention_count": 5}],
        }
        finder = OpportunityFinder(llm_client=MockLLM())
        opportunities = asyncio.run(finder.find_opportunities(trends, ["ml"]))
        assert len(opportunities) >= 1
        assert opportunities[0]["source"] == "llm"

    def test_llm_fallback_on_failure(self):
        """A failing LLM client falls back to the heuristic path."""
        trends = {
            "rising_keywords": [{"keyword": "test", "count": 5}],
            "method_trends": [{"method": "GAN", "mention_count": 3}],
        }
        finder = OpportunityFinder(llm_client=FailingLLM())
        opportunities = asyncio.run(finder.find_opportunities(trends, ["ml"]))
        assert all(opp["source"] == "heuristic" for opp in opportunities)

    def test_parse_opportunities(self):
        """Well-formed TOPIC|WHY|FEASIBILITY lines parse; noise lines are dropped."""
        response = (
            "TOPIC: Adaptive transformers | WHY: Trending | FEASIBILITY: high\n"
            "TOPIC: Diffusion for audio | WHY: New area | FEASIBILITY: medium\n"
            "Some noise line\n"
        )
        parsed = OpportunityFinder._parse_opportunities(response)
        assert len(parsed) == 2
        assert parsed[0]["topic"] == "Adaptive transformers"
        assert parsed[0]["feasibility"] == "high"

    def test_heuristic_max_five(self):
        """The heuristic path caps its output at five opportunities."""
        trends = {
            "rising_keywords": [
                {"keyword": f"kw{i}", "count": 10} for i in range(10)
            ],
            "method_trends": [
                {"method": f"method{i}", "mention_count": 5} for i in range(10)
            ],
        }
        opportunities = asyncio.run(
            OpportunityFinder().find_opportunities(trends, ["ml"])
        )
        assert len(opportunities) <= 5
# ===================================================================
# DailyDigest tests
# ===================================================================
class TestDailyDigest:
    """Tests for DailyDigest generation, saving, and summary parsing."""

    def test_generate_basic_no_papers(self):
        """An empty feed produces the 'no papers' message."""
        digest = DailyDigest(FeedManager(sources=()))
        assert "No new papers found" in asyncio.run(digest.generate(["ml"]))

    def test_generate_basic_with_papers(self):
        """The digest header and paper count appear in the output."""
        manager = FeedManager(sources=("arxiv",))
        with patch.object(manager, "fetch_recent_papers", return_value=_make_papers(3)):
            output = asyncio.run(DailyDigest(manager).generate(["ml"]))
        assert "Daily Paper Digest" in output
        assert "Papers found: 3" in output

    def test_generate_basic_truncates_abstract(self):
        """Over-long abstracts are truncated with an ellipsis."""
        manager = FeedManager(sources=("arxiv",))
        long_paper = [{"title": "Test", "abstract": "x" * 500, "authors": [], "url": ""}]
        with patch.object(manager, "fetch_recent_papers", return_value=long_paper):
            output = asyncio.run(DailyDigest(manager).generate(["ml"]))
        assert "..." in output

    def test_parse_summary_valid(self):
        """A well-formed SUMMARY|RELEVANCE response parses into its parts."""
        summary, relevance = DailyDigest._parse_summary(
            "SUMMARY: Great paper on attention | RELEVANCE: 4"
        )
        assert summary == "Great paper on attention"
        assert relevance == 4

    def test_parse_summary_no_format(self):
        """Unstructured responses pass through verbatim with default relevance."""
        raw = "Just a plain text response."
        summary, relevance = DailyDigest._parse_summary(raw)
        assert summary == raw
        assert relevance == 3  # default

    def test_parse_summary_clamped(self):
        """Out-of-range relevance scores are clamped to 5."""
        _, relevance = DailyDigest._parse_summary("SUMMARY: x | RELEVANCE: 99")
        assert relevance == 5

    def test_generate_and_save(self, tmp_path: Path):
        """generate_and_save writes a digest file starting with the header."""
        manager = FeedManager(sources=("arxiv",))
        with patch.object(manager, "fetch_recent_papers", return_value=_make_papers(2)):
            saved = asyncio.run(DailyDigest(manager).generate_and_save(tmp_path, ["ml"]))
        assert saved.exists()
        assert saved.read_text(encoding="utf-8").startswith("## Daily Paper Digest")

    def test_author_formatting_dict(self):
        """Four or more dict-style authors collapse to 'et al.'."""
        manager = FeedManager(sources=("arxiv",))
        paper = [{
            "title": "T",
            "abstract": "",
            "url": "",
            "authors": [{"name": "A"}, {"name": "B"}, {"name": "C"}, {"name": "D"}],
        }]
        with patch.object(manager, "fetch_recent_papers", return_value=paper):
            output = asyncio.run(DailyDigest(manager).generate(["ml"]))
        assert "et al." in output
# ===================================================================
# AutoTopicGenerator tests
# ===================================================================
class TestAutoTopicGenerator:
    """Tests for AutoTopicGenerator candidate generation, selection and scoring."""

    @staticmethod
    def _new_generator():
        """Build a generator wired to fresh analyzer/finder instances."""
        return AutoTopicGenerator(TrendAnalyzer(), OpportunityFinder())

    def test_generate_candidates(self):
        """At most `count` candidates are produced, each carrying scores."""
        candidates = asyncio.run(
            self._new_generator().generate_candidates(["ml"], _make_papers(10), count=3)
        )
        assert len(candidates) <= 3
        if candidates:
            first = candidates[0]
            assert "topic" in first
            assert "overall_score" in first

    def test_generate_candidates_empty(self):
        """Empty corpus yields no keywords/methods, hence possibly no candidates."""
        candidates = asyncio.run(
            self._new_generator().generate_candidates(["ml"], [], count=3)
        )
        assert isinstance(candidates, list)

    def test_auto_select_default_fallback(self):
        """With nothing to work from, auto_select falls back to the default topic."""
        chosen = asyncio.run(self._new_generator().auto_select(["ml"], []))
        assert "topic" in chosen
        assert chosen["source"] == "default"

    def test_score_candidate_feasibility(self):
        """Feasibility maps high→0.9 and low→0.3, and drives the overall score."""
        trend = {"rising_keywords": [], "paper_count": 50}
        high = AutoTopicGenerator._score_candidate(
            {"topic": "unique topic xyz", "feasibility": "high"}, trend
        )
        low = AutoTopicGenerator._score_candidate(
            {"topic": "unique topic xyz", "feasibility": "low"}, trend
        )
        assert high["feasibility"] == 0.9
        assert low["feasibility"] == 0.3
        assert high["overall"] > low["overall"]

    def test_score_candidate_novelty_decay(self):
        """Overlap with rising keywords reduces novelty below 1.0."""
        trend = {
            "rising_keywords": [
                {"keyword": "graph neural", "count": 10},
                {"keyword": "neural network", "count": 8},
            ],
            "paper_count": 50,
        }
        scored = AutoTopicGenerator._score_candidate(
            {"topic": "graph neural", "feasibility": "medium"}, trend
        )
        assert scored["novelty"] < 1.0  # overlap penalizes novelty

    def test_score_candidate_weights(self):
        """Overall = 0.4*novelty + 0.3*feasibility + 0.3*impact."""
        scored = AutoTopicGenerator._score_candidate(
            {"topic": "totally unique xyz", "feasibility": "high"},
            {"rising_keywords": [], "paper_count": 50},
        )
        expected = round(
            0.4 * scored["novelty"]
            + 0.3 * scored["feasibility"]
            + 0.3 * scored["impact"],
            3,
        )
        assert scored["overall"] == expected

    def test_format_candidates_empty(self):
        """An empty candidate list renders a 'No candidate' message."""
        assert "No candidate" in self._new_generator().format_candidates([])

    def test_format_candidates_with_data(self):
        """Topic and overall score appear in the formatted output."""
        candidate = {
            "topic": "Test topic",
            "overall_score": 0.75,
            "novelty_score": 0.8,
            "feasibility_score": 0.7,
            "impact_score": 0.6,
            "rationale": "Good idea",
        }
        rendered = self._new_generator().format_candidates([candidate])
        assert "Test topic" in rendered
        assert "0.75" in rendered
# ===================================================================
# LiteratureTrendAnalyzer tests
# ===================================================================
class TestLiteratureTrendAnalyzer:
    """Tests for LiteratureTrendAnalyzer's no-client fallbacks and filtering."""

    def test_no_client_returns_empty(self):
        assert LiteratureTrendAnalyzer().get_daily_papers(["ml"]) == []

    def test_analyze_keyword_trends_no_client(self):
        trends = LiteratureTrendAnalyzer().analyze_keyword_trends(["ml"])
        assert trends["total_papers"] == 0

    def test_find_emerging_topics_no_client(self):
        assert LiteratureTrendAnalyzer().find_emerging_topics(["ml"]) == []

    def test_find_emerging_topics_filters_bigrams(self):
        """Only bigrams with count >= 3 are considered emerging."""
        analyzer = LiteratureTrendAnalyzer(search_client="fake")
        with patch.object(analyzer, "get_daily_papers", return_value=_make_papers(20)):
            emerging = analyzer.find_emerging_topics(["ml"])
        for topic in emerging:
            assert topic["type"] == "bigram"
            assert topic["frequency"] >= 3
================================================
FILE: tests/test_universal_codegen_integration.py
================================================
"""Integration tests for universal cross-domain code generation.
Tests the full pipeline from domain detection → adapter selection →
prompt block generation → blueprint context building, across multiple
research domains. These tests do NOT require an LLM or network —
they verify the infrastructure wiring.
"""
from __future__ import annotations
import json
import pytest
from pathlib import Path
from unittest.mock import MagicMock, patch
from researchclaw.domains.detector import (
DomainProfile,
detect_domain,
get_profile,
is_ml_domain,
load_all_profiles,
)
from researchclaw.domains.prompt_adapter import get_adapter, PromptBlocks
from researchclaw.domains.experiment_schema import (
Condition,
ConditionRole,
EvaluationSpec,
MetricSpec,
UniversalExperimentPlan,
from_legacy_exp_plan,
)
from researchclaw.experiment.metrics import UniversalMetricParser
from researchclaw.experiment.evaluators.convergence import analyze_convergence
from researchclaw.agents.code_searcher.agent import CodeSearchAgent, CodeSearchResult
from researchclaw.agents.code_searcher.pattern_extractor import CodePatterns
# ---------------------------------------------------------------------------
# Cross-domain domain detection integration
# ---------------------------------------------------------------------------
class TestCrossDomainDetection:
    """Test domain detection across all supported domains."""

    @staticmethod
    def _detect(description, expected_id):
        """Detect the domain for *description* and assert its id matches."""
        profile = detect_domain(description)
        assert profile.domain_id == expected_id
        return profile

    def test_all_profiles_loadable(self):
        assert len(load_all_profiles()) >= 18  # at least 18 domain profiles

    def test_ml_vision_full_pipeline(self):
        """ML Vision: detect → adapter → blocks → legacy compatibility."""
        profile = self._detect(
            "image classification on CIFAR-10 with ResNet", "ml_vision"
        )
        assert is_ml_domain(profile)
        blocks = get_adapter(profile).get_code_generation_blocks({})
        # ML adapter returns empty blocks (existing behavior)
        assert blocks.compute_budget == ""

    def test_physics_pde_full_pipeline(self):
        """Physics PDE: detect → adapter → blocks with convergence guidance."""
        profile = self._detect(
            "finite element method for Poisson equation", "physics_pde"
        )
        assert not is_ml_domain(profile)
        adapter = get_adapter(profile)
        # Physics adapter should provide non-empty guidance
        assert adapter.get_code_generation_blocks({}).code_generation_hints
        # Blueprint context should mention convergence
        assert adapter.get_blueprint_context()

    def test_economics_full_pipeline(self):
        """Economics: detect → adapter → progressive spec guidance."""
        profile = self._detect(
            "panel data regression with instrumental variables", "economics_empirical"
        )
        blocks = get_adapter(profile).get_experiment_design_blocks({})
        assert "progressive" in blocks.experiment_design_context.lower()

    def test_chemistry_full_pipeline(self):
        """Chemistry: detect → adapter → PySCF guidance."""
        profile = self._detect(
            "DFT calculation with PySCF for molecular energies", "chemistry_qm"
        )
        assert get_adapter(profile).get_code_generation_blocks({}).code_generation_hints

    def test_biology_full_pipeline(self):
        """Biology: detect → adapter → scanpy guidance."""
        profile = self._detect(
            "single-cell RNA-seq clustering with scanpy", "biology_singlecell"
        )
        assert get_adapter(profile).get_code_generation_blocks({}).code_generation_hints

    def test_math_full_pipeline(self):
        """Math: detect → adapter → convergence guidance."""
        profile = self._detect(
            "Runge-Kutta ODE solver convergence analysis", "mathematics_numerical"
        )
        assert get_adapter(profile).get_code_generation_blocks({}).code_generation_hints
# ---------------------------------------------------------------------------
# Universal Experiment Schema integration
# ---------------------------------------------------------------------------
class TestExperimentSchemaIntegration:
    """Integration tests for the universal experiment plan schema."""

    def test_physics_convergence_plan(self):
        """Create a physics convergence study plan."""
        conditions = [
            Condition(name="FDM_2nd", role="reference",
                      description="2nd order finite difference"),
            Condition(name="FEM_P1", role="proposed",
                      description="P1 finite element method"),
            Condition(name="FEM_P2", role="variant",
                      varies_from="FEM_P1",
                      description="P2 finite element method"),
        ]
        evaluation = EvaluationSpec(
            primary_metric=MetricSpec(
                name="l2_error",
                direction="minimize",
                unit="relative",
            ),
            protocol="Run at 5 grid sizes, measure L2 error",
            statistical_test="convergence_order_fit",
            num_seeds=1,
        )
        plan = UniversalExperimentPlan(
            experiment_type="convergence",
            domain_id="physics_pde",
            problem_description="Solve Poisson equation with FEM and FDM",
            conditions=conditions,
            evaluation=evaluation,
            main_figure_type="convergence_plot",
        )
        # Role-based condition views
        assert len(plan.references) == 1
        assert len(plan.proposed) == 1
        assert len(plan.variants) == 1
        # Test legacy format conversion
        legacy = plan.to_legacy_format()
        assert len(legacy["baselines"]) == 1
        assert legacy["baselines"][0]["name"] == "FDM_2nd"
        assert "l2_error" in legacy["metrics"]
        # Test YAML serialization
        serialized = plan.to_yaml()
        assert "convergence" in serialized
        assert "FDM_2nd" in serialized

    def test_economics_progressive_plan(self):
        """Create an economics progressive specification plan."""
        plan = UniversalExperimentPlan(
            experiment_type="progressive_spec",
            domain_id="economics_empirical",
            conditions=[
                Condition(name="OLS", role="reference",
                          description="Simple OLS"),
                Condition(name="OLS_controls", role="proposed",
                          description="OLS with control variables"),
                Condition(name="FE", role="variant",
                          varies_from="OLS_controls",
                          description="Fixed effects"),
                Condition(name="IV_2SLS", role="variant",
                          varies_from="OLS_controls",
                          description="Instrumental variables"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(name="coefficient", direction="maximize"),
                statistical_test="hausman_test",
            ),
            main_table_type="regression_table",
        )
        assert len(plan.conditions) == 4
        assert len(plan.to_legacy_format()["ablations"]) == 2  # FE and IV are variants
# ---------------------------------------------------------------------------
# Metric Parser + Convergence Evaluator integration
# ---------------------------------------------------------------------------
class TestMetricConvergenceIntegration:
    """End-to-end tests for metric parsing plus convergence analysis."""

    def test_json_convergence_end_to_end(self, tmp_path):
        """Parse JSON convergence results → analyze convergence → report."""
        payload = {
            "experiment_type": "convergence",
            "convergence": {
                "euler": [
                    {"h": 0.1, "error": 0.1},
                    {"h": 0.05, "error": 0.05},
                    {"h": 0.025, "error": 0.025},
                    {"h": 0.0125, "error": 0.0125},
                ],
                "rk4": [
                    {"h": 0.1, "error": 1e-4},
                    {"h": 0.05, "error": 6.25e-6},
                    {"h": 0.025, "error": 3.9e-7},
                    {"h": 0.0125, "error": 2.44e-8},
                ],
            },
            "metadata": {"domain": "mathematics_numerical"},
        }
        (tmp_path / "results.json").write_text(json.dumps(payload))
        # Parse
        parsed = UniversalMetricParser().parse(tmp_path)
        assert parsed.source == "json"
        assert "euler" in parsed.convergence
        # Fit observed orders against their theoretical values
        report = analyze_convergence(
            parsed.convergence,
            expected_orders={"euler": 1.0, "rk4": 4.0},
        )
        assert len(report.methods) == 2
        euler = next(r for r in report.methods if r.method == "euler")
        rk4 = next(r for r in report.methods if r.method == "rk4")
        assert abs(euler.convergence_order - 1.0) < 0.2
        assert abs(rk4.convergence_order - 4.0) < 0.5
        assert rk4.convergence_order > euler.convergence_order
        assert report.best_method == "rk4"

    def test_flat_metrics_backward_compatible(self, tmp_path):
        """Ensure new metric parser produces backward-compatible output."""
        # Old-style stdout with plain "name: value" lines
        parsed = UniversalMetricParser().parse(
            tmp_path,
            stdout="accuracy: 0.95\nloss: 0.32\ncondition=proposed accuracy: 0.95\n",
        )
        flattened = parsed.to_flat_metrics()
        assert "accuracy" in flattened
        assert "loss" in flattened
        assert flattened["accuracy"] == 0.95
# ---------------------------------------------------------------------------
# Code Search + Domain Profile integration
# ---------------------------------------------------------------------------
class TestCodeSearchIntegration:
    """Integration of code search results with domain-aware prompting."""

    def test_code_search_result_in_blueprint(self):
        """Code search results should be formattable as prompt context."""
        patterns = CodePatterns(
            api_patterns=[
                "from pyscf import gto, scf\nmol = gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g')",
            ],
            file_structure={"main.py": "Entry point", "molecule.py": "Molecule definitions"},
            evaluation_patterns=["mae = np.mean(np.abs(predicted - reference))"],
        )
        repos = [MagicMock(full_name="user/pyscf-example", stars=200)]
        rendered = CodeSearchResult(
            patterns=patterns, repos_found=repos
        ).to_prompt_context()
        assert "pyscf" in rendered
        assert "molecule.py" in rendered

    def test_domain_adapter_blueprint_context(self):
        """Domain adapter should produce useful blueprint context."""
        profile = get_profile("physics_simulation")
        if profile is None:
            pytest.skip("physics_simulation profile not found")
        context = get_adapter(profile).get_blueprint_context()
        # Should mention file structure
        assert "main.py" in context or "integrator" in context.lower()
        # Should mention libraries
        assert "numpy" in context.lower() or "scipy" in context.lower() or context != ""
# ---------------------------------------------------------------------------
# CodeAgent domain injection test
# ---------------------------------------------------------------------------
class TestCodeAgentDomainInjection:
    """Verify CodeAgent accepts and wires domain profiles and code search results."""

    @staticmethod
    def _build_agent(profile, search_result):
        """Construct a CodeAgent with mocked LLM/prompts and the given domain inputs."""
        from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig

        return CodeAgent(
            llm=MagicMock(),
            prompts=MagicMock(),
            config=CodeAgentConfig(enabled=True),
            stage_dir=Path("/tmp/test"),
            domain_profile=profile,
            code_search_result=search_result,
        )

    def test_code_agent_accepts_domain_profile(self):
        """CodeAgent should accept domain_profile and code_search_result."""
        profile = DomainProfile(
            domain_id="physics_pde",
            display_name="PDE Solvers",
            core_libraries=["numpy", "scipy"],
        )
        search_result = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=["import scipy.sparse"],
            ),
        )
        agent = self._build_agent(profile, search_result)
        # Verify the domain context builder works
        ctx = agent._build_domain_context()
        assert "scipy" in ctx.lower() or ctx != ""

    def test_code_agent_ml_domain_no_extra_context(self):
        """ML domain should add minimal extra context (preserve existing behavior)."""
        profile = get_profile("ml_vision") or DomainProfile(
            domain_id="ml_vision",
            display_name="Computer Vision",
        )
        agent = self._build_agent(profile, None)  # no code search for ML
        # ML adapter returns empty blocks → minimal context.  Some context from
        # file structure is acceptable, but nothing from code search may appear
        # since no code_search_result was provided.
        assert "Reference Code from GitHub" not in agent._build_domain_context()
# ---------------------------------------------------------------------------
# Docker profile mapping test
# ---------------------------------------------------------------------------
class TestDockerProfileMapping:
    """Verify domain → docker profile mappings in docker_profiles.yaml.

    Fixes: the YAML-loading boilerplate was duplicated in both tests (now a
    shared helper), and a dead `unmapped` computation that was built but never
    asserted on has been removed.
    """

    @staticmethod
    def _load_docker_config():
        """Load docker_profiles.yaml, skipping the calling test if it is absent.

        Returns the parsed YAML mapping (dict).
        """
        import yaml

        profiles_path = (
            Path(__file__).parent.parent / "researchclaw" / "data" / "docker_profiles.yaml"
        )
        if not profiles_path.exists():
            pytest.skip("docker_profiles.yaml not found")
        with profiles_path.open() as f:
            return yaml.safe_load(f)

    def test_domain_to_docker_mapping(self):
        """All domains should map to a valid docker profile."""
        docker_config = self._load_docker_config()
        domain_map = docker_config.get("domain_map", {})
        profiles = docker_config.get("profiles", {})
        # Every mapped domain should point to a valid profile
        for domain_id, profile_name in domain_map.items():
            assert profile_name in profiles, (
                f"Domain {domain_id} maps to unknown profile: {profile_name}"
            )

    def test_all_loaded_domains_have_docker_mapping(self):
        """Core domain profiles must have a docker mapping.

        New domains without docker images yet are tolerated, but the core
        set below must always be mapped.
        """
        domain_map = self._load_docker_config().get("domain_map", {})
        core_domains = [
            "ml_vision", "ml_nlp", "ml_rl", "physics_simulation",
            "physics_pde", "chemistry_qm", "economics_empirical",
            "mathematics_numerical",
        ]
        for d in core_domains:
            assert d in domain_map, f"Core domain {d} missing from docker mapping"
================================================
FILE: tests/test_v6_improvements.py
================================================
"""Tests for V6 improvements (IMP-13 through IMP-16).
Run with:
.venv/bin/python3 -m pytest tests/test_v6_improvements.py -v
or:
.venv/bin/python3 tests/test_v6_improvements.py
"""
from __future__ import annotations
import re
import sys
import statistics
import random
import textwrap
from pathlib import Path
# ============================================================
# IMP-13: Test _extract_paper_title import & behaviour
# ============================================================
class TestIMP13_ExtractPaperTitle:
    """IMP-13: runner.py imports _extract_paper_title from executor.
    Verify the import works and the function produces correct results."""

    def test_import_works(self):
        """The import `from researchclaw.pipeline.executor import _extract_paper_title`
        must succeed — runner.py line 394 depends on it."""
        from researchclaw.pipeline.executor import _extract_paper_title

        assert callable(_extract_paper_title), "_extract_paper_title should be callable"
        print("[IMP-13] PASS: import _extract_paper_title works")

    def test_extracts_h1_title(self):
        from researchclaw.pipeline.executor import _extract_paper_title

        markdown = textwrap.dedent("""\
            # A Novel Approach to Deep Reinforcement Learning
            ## Abstract
            This paper presents...
            """)
        title = _extract_paper_title(markdown)
        assert title == "A Novel Approach to Deep Reinforcement Learning", \
            f"Expected H1 title, got: {title!r}"
        print(f"[IMP-13] PASS: extracted title = {title!r}")

    def test_skips_abstract_heading(self):
        """Title before Abstract should be found; Abstract heading itself skipped."""
        from researchclaw.pipeline.executor import _extract_paper_title

        markdown = textwrap.dedent("""\
            # A Real Title of at Least Four Words
            ## Abstract
            Some text...
            """)
        title = _extract_paper_title(markdown)
        # "Abstract" should be skipped; the real title (before Abstract) is found
        assert title == "A Real Title of at Least Four Words", \
            f"Expected real title, got: {title!r}"
        print(f"[IMP-13] PASS: skipped Abstract, got title = {title!r}")

    def test_title_after_abstract_not_found(self):
        """If the only real title is AFTER Abstract, it should not be found
        (function searches only before Abstract heading)."""
        from researchclaw.pipeline.executor import _extract_paper_title

        markdown = textwrap.dedent("""\
            # Abstract
            # A Title That Appears After Abstract
            Some text...
            """)
        title = _extract_paper_title(markdown)
        # Title after Abstract is not in the search region, so fallback
        assert title == "Untitled Paper", \
            f"Expected 'Untitled Paper' since title is after Abstract, got: {title!r}"
        print(f"[IMP-13] PASS: title after Abstract not found, fallback = {title!r}")

    def test_fallback_untitled(self):
        from researchclaw.pipeline.executor import _extract_paper_title

        title = _extract_paper_title("Just some text without any headings.")
        assert title == "Untitled Paper", f"Expected 'Untitled Paper', got: {title!r}"
        print(f"[IMP-13] PASS: fallback = {title!r}")

    def test_bold_title(self):
        from researchclaw.pipeline.executor import _extract_paper_title

        markdown = textwrap.dedent("""\
            **A Bold Title for This Paper**
            ## Abstract
            Text here...
            """)
        title = _extract_paper_title(markdown)
        assert "Bold Title" in title, f"Expected bold title, got: {title!r}"
        print(f"[IMP-13] PASS: bold title = {title!r}")
# ============================================================
# IMP-14: Test orphaned cite-key stripping logic
# ============================================================
class TestIMP14_StripOrphanedCites:
    """IMP-14: After packaging, any \\cite{key} where key is not in
    references.bib should be stripped from paper.tex."""

    @staticmethod
    def _run_cite_stripping(tex_text: str, bib_text: str) -> str:
        """Reproduce the IMP-14 logic from runner.py lines 505-532."""
        # Collect every cited key, then every key actually defined in the bib.
        cited: set[str] = set()
        for cite_match in re.finditer(r"\\cite\{([^}]+)\}", tex_text):
            cited.update(key.strip() for key in cite_match.group(1).split(","))
        defined = set(re.findall(r"@\w+\{([^,]+),", bib_text))
        orphaned = cited - defined
        if orphaned:
            def _filter_cite(m: re.Match[str]) -> str:
                keys = [k.strip() for k in m.group(1).split(",")]
                surviving = [k for k in keys if k not in orphaned]
                if not surviving:
                    return ""
                return "\\cite{" + ", ".join(surviving) + "}"

            tex_text = re.sub(r"\\cite\{([^}]+)\}", _filter_cite, tex_text)
            # Tidy whitespace left behind by fully-removed \cite commands.
            tex_text = re.sub(r" +", " ", tex_text)
            tex_text = re.sub(r" ([.,;:)])", r"\1", tex_text)
        return tex_text

    def test_mixed_real_and_missing_keys(self):
        """\\cite{real_key, missing_key} should become \\cite{real_key}."""
        bib = textwrap.dedent("""\
            @article{real_key,
            author = {Doe},
            title = {Real Paper},
            year = {2024},
            }
            """)
        stripped = self._run_cite_stripping(
            r"Some text \cite{real_key, missing_key} and more.", bib
        )
        assert r"\cite{real_key}" in stripped, f"Expected \\cite{{real_key}}, got: {stripped!r}"
        assert "missing_key" not in stripped, f"missing_key should be gone: {stripped!r}"
        print(f"[IMP-14] PASS: mixed keys → {stripped!r}")

    def test_all_keys_missing(self):
        """\\cite{missing1, missing2} should be entirely removed."""
        stripped = self._run_cite_stripping(
            r"Some text \cite{missing1, missing2} more.", ""  # empty bib
        )
        assert r"\cite" not in stripped, f"Expected no \\cite, got: {stripped!r}"
        print(f"[IMP-14] PASS: all missing → {stripped!r}")

    def test_all_keys_valid(self):
        """When all keys are valid, tex should remain unchanged (except whitespace)."""
        bib = textwrap.dedent("""\
            @article{key1,
            author = {A},
            title = {T},
            year = {2024},
            }
            @article{key2,
            author = {B},
            title = {T2},
            year = {2024},
            }
            """)
        stripped = self._run_cite_stripping(r"Text \cite{key1, key2} end.", bib)
        assert r"\cite{key1, key2}" in stripped, f"Expected unchanged, got: {stripped!r}"
        print(f"[IMP-14] PASS: all valid → {stripped!r}")

    def test_multiple_cite_commands(self):
        """Multiple \\cite commands, each with different missing keys."""
        bib = textwrap.dedent("""\
            @article{a,
            author = {X},
            title = {Y},
            year = {2024},
            }
            @article{c,
            author = {X},
            title = {Y},
            year = {2024},
            }
            """)
        stripped = self._run_cite_stripping(
            r"First \cite{a, b} second \cite{b, c} third \cite{d}.", bib
        )
        # a is valid, b is missing, c is valid, d is missing
        assert r"\cite{a}" in stripped, f"Expected \\cite{{a}}, got: {stripped!r}"
        assert r"\cite{c}" in stripped, f"Expected \\cite{{c}}, got: {stripped!r}"
        # b should not appear as a cite key
        assert r"\cite{b}" not in stripped, f"\\cite{{b}} should be gone: {stripped!r}"
        assert r", b}" not in stripped and r"{b," not in stripped, \
            f"b key should be stripped: {stripped!r}"
        # \cite{d} should be entirely removed (d was the only key)
        assert r"\cite{d}" not in stripped, f"\\cite{{d}} should be gone: {stripped!r}"
        print(f"[IMP-14] PASS: multiple cites → {stripped!r}")

    def test_whitespace_cleanup(self):
        """After removing a full \\cite{}, leftover double-spaces and ' .' are cleaned."""
        stripped = self._run_cite_stripping(r"Text \cite{missing} end.", "")
        # Should not have double spaces or " ."
        assert "  " not in stripped, f"Double space in result: {stripped!r}"
        assert " ." not in stripped, f"Space-dot in result: {stripped!r}"
        print(f"[IMP-14] PASS: whitespace cleanup → {stripped!r}")
# ============================================================
# IMP-15: Test BibTeX deduplication
# ============================================================
class TestIMP15_BibDedup:
    """IMP-15: Deduplicate .bib entries sharing the same cite key."""

    @staticmethod
    def _run_dedup(bib_text: str) -> str:
        """Reproduce IMP-15 logic from runner.py lines 486-503."""
        seen_keys: set[str] = set()
        unique_entries: list[str] = []
        # Walk whole entries (header through closing brace), keeping the
        # first occurrence of each cite key.
        for entry in re.finditer(r"(@\w+\{([^,]+),.*?\n\})", bib_text, re.DOTALL):
            key = entry.group(2).strip()
            if key not in seen_keys:
                seen_keys.add(key)
                unique_entries.append(entry.group(1))
        total_entries = len(list(re.finditer(r"@\w+\{", bib_text)))
        # Only rebuild the text when at least one duplicate was dropped.
        if len(unique_entries) < total_entries:
            bib_text = "\n\n".join(unique_entries) + "\n"
        return bib_text

    def test_duplicate_entries_removed(self):
        bib = textwrap.dedent("""\
            @article{smith2024,
            author = {Smith},
            title = {Paper 1},
            year = {2024},
            }
            @article{smith2024,
            author = {Smith},
            title = {Paper 1 duplicate},
            year = {2024},
            }
            @article{jones2023,
            author = {Jones},
            title = {Paper 2},
            year = {2023},
            }
            """)
        deduped = self._run_dedup(bib)
        # Count how many @article{smith2024, appear
        count_smith = len(re.findall(r"@article\{smith2024,", deduped))
        count_jones = len(re.findall(r"@article\{jones2023,", deduped))
        assert count_smith == 1, f"Expected 1 smith2024 entry, got {count_smith}"
        assert count_jones == 1, f"Expected 1 jones2023 entry, got {count_jones}"
        # First version should be kept
        assert "Paper 1" in deduped
        print(f"[IMP-15] PASS: 2 smith2024 → 1, jones2023 kept. Total entries correct.")

    def test_no_duplicates_unchanged(self):
        bib = textwrap.dedent("""\
            @article{alpha2024,
            author = {Alpha},
            title = {A},
            year = {2024},
            }
            @inproceedings{beta2023,
            author = {Beta},
            title = {B},
            year = {2023},
            }
            """)
        deduped = self._run_dedup(bib)
        # Should remain unchanged (both entries present)
        assert "alpha2024" in deduped
        assert "beta2023" in deduped
        total = len(re.findall(r"@\w+\{", deduped))
        assert total == 2, f"Expected 2 entries, got {total}"
        print(f"[IMP-15] PASS: no duplicates → unchanged")

    def test_triple_duplicate(self):
        bib = textwrap.dedent("""\
            @article{x2024,
            author = {X},
            title = {First},
            year = {2024},
            }
            @article{x2024,
            author = {X},
            title = {Second},
            year = {2024},
            }
            @article{x2024,
            author = {X},
            title = {Third},
            year = {2024},
            }
            """)
        deduped = self._run_dedup(bib)
        survivors = len(re.findall(r"@article\{x2024,", deduped))
        assert survivors == 1, f"Expected 1 x2024 entry, got {survivors}"
        # First version kept
        assert "First" in deduped
        assert "Second" not in deduped
        assert "Third" not in deduped
        print(f"[IMP-15] PASS: triple duplicate → 1 entry")

    def test_empty_bib(self):
        """Edge case: empty bib text should not crash."""
        deduped = self._run_dedup("")
        assert deduped == "", f"Expected empty, got: {deduped!r}"
        print(f"[IMP-15] PASS: empty bib → no crash")
# ============================================================
# IMP-16: Test bootstrap CI fallback
# ============================================================
class TestIMP16_BootstrapCIFallback:
    """IMP-16: when the bootstrap 95% CI fails to bracket the sample mean,
    fall back to the normal approximation (mean +/- 1.96*SE)."""

    @staticmethod
    def _compute_ci_with_fallback(vals: list[float]) -> tuple[float, float, bool]:
        """Mirror of the IMP-16 logic in executor.py (lines 3367-3397).

        Returns (ci_low, ci_high, used_fallback).
        """
        _mean = statistics.mean(vals)
        _std = statistics.stdev(vals)
        # Percentile bootstrap (fixed seed for reproducibility).
        rng = random.Random(42)
        n = len(vals)
        boot_means = sorted(
            statistics.mean([rng.choice(vals) for _ in range(n)])
            for _ in range(1000)
        )
        ci_low = round(boot_means[int(0.025 * len(boot_means))], 6)
        ci_high = round(boot_means[int(0.975 * len(boot_means))], 6)
        # IMP-16 sanity check: the interval must bracket the mean;
        # otherwise switch to the normal approximation.
        used_fallback = not (ci_low <= _mean <= ci_high)
        if used_fallback:
            se = _std / (n ** 0.5)
            ci_low = round(_mean - 1.96 * se, 6)
            ci_high = round(_mean + 1.96 * se, 6)
        return ci_low, ci_high, used_fallback

    def test_normal_case_no_fallback(self):
        """Well-behaved data: the bootstrap CI brackets the mean directly."""
        vals = [0.8, 0.82, 0.79, 0.81, 0.83]
        ci_low, ci_high, used_fallback = self._compute_ci_with_fallback(vals)
        mean = statistics.mean(vals)
        assert ci_low <= mean <= ci_high, \
            f"CI [{ci_low}, {ci_high}] should contain mean {mean}"
        assert not used_fallback, "Should NOT have used fallback for normal data"
        print(f"[IMP-16] PASS: normal data → CI=[{ci_low}, {ci_high}], mean={mean:.4f}, no fallback")

    def test_fallback_triggers_for_pathological_data(self):
        """Exercise the fallback formula directly.

        Generating data whose bootstrap CI misses the mean is inherently
        fragile, so this simulates a bad CI and checks that the
        normal-approximation fallback brackets the mean.
        """
        vals = [1.0, 2.0, 3.0, 4.0, 5.0]
        mean = statistics.mean(vals)
        se = statistics.stdev(vals) / (len(vals) ** 0.5)
        # Simulate a CI lying entirely above the mean.
        bad_ci_low = mean + 0.1
        bad_ci_high = mean + 1.0
        assert bad_ci_low > mean, "Bad CI should not contain mean"
        fallback_low = round(mean - 1.96 * se, 6)
        fallback_high = round(mean + 1.96 * se, 6)
        assert fallback_low <= mean <= fallback_high, \
            f"Fallback CI [{fallback_low}, {fallback_high}] must contain mean {mean}"
        print(f"[IMP-16] PASS: fallback CI=[{fallback_low}, {fallback_high}], mean={mean:.4f}")

    def test_fallback_ci_always_contains_mean(self):
        """The normal-approximation fallback MUST always bracket the mean."""
        test_cases = [
            [10, 20, 30],
            [0.001, 0.002, 0.003, 0.004],
            [100, 200, 300, 400, 500],
            [-5, -3, -1, 1, 3, 5],
        ]
        for vals in test_cases:
            mean = statistics.mean(vals)
            se = statistics.stdev(vals) / (len(vals) ** 0.5)
            ci_low = round(mean - 1.96 * se, 6)
            ci_high = round(mean + 1.96 * se, 6)
            assert ci_low <= mean <= ci_high, \
                f"Fallback CI [{ci_low}, {ci_high}] must contain mean {mean} for vals={vals}"
        print(f"[IMP-16] PASS: fallback always contains mean for {len(test_cases)} test cases")

    def test_condition_check_logic(self):
        """The check `_ci_low > _mean or _ci_high < _mean` must flag exactly
        the cases where the mean lies OUTSIDE the interval."""
        mean = 5.0
        # Mean below the interval → flagged.
        assert (6.0 > mean or 8.0 < mean), "Mean below CI not detected"
        # Mean above the interval → flagged.
        assert (1.0 > mean or 4.0 < mean), "Mean above CI not detected"
        # Mean strictly inside → not flagged.
        assert not (3.0 > mean or 7.0 < mean), "Mean inside CI incorrectly flagged"
        # Mean exactly on a boundary → not flagged.
        assert not (5.0 > mean or 7.0 < mean), "Mean at lower boundary incorrectly flagged"
        assert not (3.0 > mean or 5.0 < mean), "Mean at upper boundary incorrectly flagged"
        print("[IMP-16] PASS: condition check logic correct for all cases")

    def test_min_sample_size(self):
        """The real code requires len(vals) >= 3 for bootstrap; check n=3."""
        vals = [1.0, 2.0, 3.0]
        ci_low, ci_high, _ = self._compute_ci_with_fallback(vals)
        mean = statistics.mean(vals)
        assert ci_low <= mean <= ci_high, \
            f"CI [{ci_low}, {ci_high}] should contain mean {mean} for n=3"
        print(f"[IMP-16] PASS: n=3 works → CI=[{ci_low}, {ci_high}], mean={mean:.4f}")
# ============================================================
# Integration-style: Test the runner.py _package_deliverables
# cite-stripping + dedup pipeline end-to-end
# ============================================================
class TestIMP14_15_Integration:
    """End-to-end: IMP-15 bib dedup followed by IMP-14 cite stripping."""

    def test_dedup_then_strip(self):
        """Run dedup (IMP-15) then cite-strip (IMP-14) in sequence, as runner.py does."""
        bib_text = textwrap.dedent("""\
            @article{smith2024,
            author = {Smith},
            title = {Paper A},
            year = {2024},
            }
            @article{smith2024,
            author = {Smith},
            title = {Paper A dup},
            year = {2024},
            }
            @article{jones2023,
            author = {Jones},
            title = {Paper B},
            year = {2023},
            }
            """)
        tex_text = r"Results from \cite{smith2024, jones2023, ghost2024} show..."
        # --- Step 1: IMP-15 dedup (first occurrence of each key wins) ---
        seen_keys: set[str] = set()
        kept_entries: list[str] = []
        for entry in re.finditer(r"(@\w+\{([^,]+),.*?\n\})", bib_text, re.DOTALL):
            key = entry.group(2).strip()
            if key not in seen_keys:
                seen_keys.add(key)
                kept_entries.append(entry.group(1))
        total_entries = len(list(re.finditer(r"@\w+\{", bib_text)))
        if len(kept_entries) < total_entries:
            bib_text = "\n\n".join(kept_entries) + "\n"
        assert bib_text.count("smith2024") == 1, "Dedup failed for smith2024"
        # --- Step 2: IMP-14 strip cite keys that have no bib entry ---
        all_cite_keys: set[str] = set()
        for cm in re.finditer(r"\\cite\{([^}]+)\}", tex_text):
            all_cite_keys.update(k.strip() for k in cm.group(1).split(","))
        bib_keys = set(re.findall(r"@\w+\{([^,]+),", bib_text))
        missing = all_cite_keys - bib_keys
        assert missing == {"ghost2024"}, f"Expected only ghost2024 missing, got {missing}"

        def _filter_cite(m: re.Match[str]) -> str:
            # Drop keys with no bib entry; drop the whole \cite if none remain.
            kept = [k for k in (part.strip() for part in m.group(1).split(",")) if k not in missing]
            if not kept:
                return ""
            return "\\cite{" + ", ".join(kept) + "}"

        tex_text = re.sub(r"\\cite\{([^}]+)\}", _filter_cite, tex_text)
        tex_text = re.sub(r" +", " ", tex_text)
        tex_text = re.sub(r" ([.,;:)])", r"\1", tex_text)
        assert r"\cite{smith2024, jones2023}" in tex_text, \
            f"Expected valid keys kept, got: {tex_text!r}"
        assert "ghost2024" not in tex_text, \
            f"ghost2024 should be stripped: {tex_text!r}"
        print(f"[Integration] PASS: dedup + cite strip → {tex_text!r}")
# ============================================================
# Runner
# ============================================================
def run_all_tests():
    """Run all tests manually (fallback if pytest is not available).

    Instantiates each test class, invokes every ``test_*`` method in sorted
    order, and prints a pass/fail summary.

    Returns:
        bool: True iff every test passed.
    """
    test_classes = [
        TestIMP13_ExtractPaperTitle,
        TestIMP14_StripOrphanedCites,
        TestIMP15_BibDedup,
        TestIMP16_BootstrapCIFallback,
        TestIMP14_15_Integration,
    ]
    total = 0
    passed = 0
    failed = 0
    errors: list[str] = []
    for cls in test_classes:
        instance = cls()
        test_methods = [m for m in dir(instance) if m.startswith("test_")]
        for method_name in sorted(test_methods):
            total += 1
            method = getattr(instance, method_name)
            try:
                method()
                passed += 1
            except Exception as e:
                failed += 1
                err_msg = f"FAIL: {cls.__name__}.{method_name}: {e}"
                errors.append(err_msg)
                # BUG FIX: previously printed f" FAIL: {err_msg}", which emitted a
                # doubled "FAIL: FAIL:" prefix since err_msg already starts with "FAIL:".
                print(f" {err_msg}")
    print(f"\n{'='*60}")
    print(f"Results: {passed}/{total} passed, {failed} failed")
    if errors:
        print("Failures:")
        for e in errors:
            print(f" - {e}")
    print(f"{'='*60}")
    return failed == 0
if __name__ == "__main__":
    # Make the project root importable when this test file is run directly.
    _root = Path(__file__).resolve().parent.parent
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))
    sys.exit(0 if run_all_tests() else 1)
================================================
FILE: tests/test_verified_registry.py
================================================
"""Tests for VerifiedRegistry — ground truth number whitelist."""
from __future__ import annotations
import json
import math
from pathlib import Path
import pytest
from researchclaw.pipeline.verified_registry import (
ConditionResult,
VerifiedRegistry,
_is_finite,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"
def _load_experiment_summary(run_id: str) -> dict:
    """Locate and parse stage-14/experiment_summary.json for ``run_id``.

    Skips the calling test when the artifact directory or summary is absent.
    """
    hits = sorted(ARTIFACTS.glob(f"rc-*-{run_id}"))
    if not hits:
        pytest.skip(f"Artifact {run_id} not found")
    summary = hits[0] / "stage-14" / "experiment_summary.json"
    if not summary.exists():
        pytest.skip(f"No experiment_summary for {run_id}")
    return json.loads(summary.read_text())
def _load_refinement_log(run_id: str) -> dict | None:
    """Return parsed stage-13/refinement_log.json for ``run_id``, or None if absent."""
    hits = sorted(ARTIFACTS.glob(f"rc-*-{run_id}"))
    if not hits:
        return None
    log_file = hits[0] / "stage-13" / "refinement_log.json"
    if not log_file.exists():
        return None
    return json.loads(log_file.read_text())
# ---------------------------------------------------------------------------
# Unit tests — ConditionResult
# ---------------------------------------------------------------------------
class TestConditionResult:
    """ConditionResult.compute_stats() across seed-count edge cases."""

    def test_compute_stats_multiple_seeds(self):
        result = ConditionResult(name="test", per_seed_values={0: 10.0, 1: 20.0, 2: 30.0})
        result.compute_stats()
        assert result.n_seeds == 3
        assert result.mean == pytest.approx(20.0)
        assert result.std == pytest.approx(10.0)

    def test_compute_stats_single_seed(self):
        # A lone seed has no spread, so std collapses to 0.
        result = ConditionResult(name="test", per_seed_values={0: 42.0})
        result.compute_stats()
        assert result.n_seeds == 1
        assert result.mean == pytest.approx(42.0)
        assert result.std == 0.0

    def test_compute_stats_with_nan(self):
        result = ConditionResult(
            name="test", per_seed_values={0: 10.0, 1: math.nan, 2: 30.0}
        )
        result.compute_stats()
        assert result.n_seeds == 2  # NaN excluded
        assert result.mean == pytest.approx(20.0)

    def test_compute_stats_empty(self):
        # No per-seed values at all → no seeds, undefined mean.
        result = ConditionResult(name="test")
        result.compute_stats()
        assert result.n_seeds == 0
        assert result.mean is None
# ---------------------------------------------------------------------------
# Unit tests — VerifiedRegistry core operations
# ---------------------------------------------------------------------------
class TestVerifiedRegistryCore:
    """Core add/lookup/verify operations of VerifiedRegistry."""

    def test_add_value(self):
        registry = VerifiedRegistry()
        registry.add_value(74.28, "test_source")
        assert registry.is_verified(74.28)
        # A rounded variant should still match within tolerance.
        assert registry.is_verified(74.3, tolerance=0.01)

    def test_percentage_conversion(self):
        """Value in [0,1] should also register value*100."""
        registry = VerifiedRegistry()
        registry.add_value(0.7428, "accuracy_fraction")
        assert registry.is_verified(0.7428)
        assert registry.is_verified(74.28)  # ×100 variant

    def test_reverse_percentage(self):
        """Value > 1 should also register value/100."""
        registry = VerifiedRegistry()
        registry.add_value(74.28, "accuracy_percent")
        assert registry.is_verified(74.28)
        assert registry.is_verified(0.7428)  # ÷100 variant

    def test_tolerance_matching(self):
        registry = VerifiedRegistry()
        registry.add_value(92.14, "test")
        assert registry.is_verified(92.14)
        # ~0.15% off — inside the 1% tolerance.
        assert registry.is_verified(92.0, tolerance=0.01)
        # Well outside the 1% tolerance.
        assert not registry.is_verified(95.0, tolerance=0.01)

    def test_zero_handling(self):
        registry = VerifiedRegistry()
        registry.add_value(0.0, "zero_metric")
        assert registry.is_verified(0.0)
        assert registry.is_verified(1e-8)  # effectively zero
        assert not registry.is_verified(0.01)  # not close enough

    def test_negative_values(self):
        registry = VerifiedRegistry()
        registry.add_value(-459.6, "bad_return")
        assert registry.is_verified(-459.6)
        assert registry.is_verified(-460.0, tolerance=0.01)

    def test_nan_inf_rejected(self):
        registry = VerifiedRegistry()
        registry.add_value(math.nan, "nan_metric")
        registry.add_value(math.inf, "inf_metric")
        assert not registry.is_verified(math.nan)
        assert not registry.is_verified(math.inf)
        # Neither non-finite value should have been stored at all.
        assert len(registry.values) == 0

    def test_lookup(self):
        registry = VerifiedRegistry()
        registry.add_value(42.0, "the_answer")
        assert registry.lookup(42.0) == "the_answer"
        assert registry.lookup(999.0) is None

    def test_verify_condition(self):
        registry = VerifiedRegistry()
        registry.condition_names = {"DQN", "DQN+Abstraction"}
        assert registry.verify_condition("DQN")
        assert not registry.verify_condition("PPO")
# ---------------------------------------------------------------------------
# Unit tests — from_experiment (synthetic data)
# ---------------------------------------------------------------------------
class TestFromExperiment:
    """Unit tests for VerifiedRegistry.from_experiment on a synthetic summary."""

    def _make_summary(self) -> dict:
        # Synthetic experiment summary: CondA has seeds {0: 80.0, 1: 85.0}
        # (mean 82.5, stdev ≈ 3.5355) and CondB a single seed {0: 70.0}.
        # primary_metric/primary_metric_std mirror CondA's aggregate stats so
        # the derived-statistics assertions below line up.
        return {
            "metrics_summary": {
                "CondA/0/metric": {"min": 80.0, "max": 80.0, "mean": 80.0, "count": 1},
                "CondA/1/metric": {"min": 85.0, "max": 85.0, "mean": 85.0, "count": 1},
                "CondB/0/metric": {"min": 70.0, "max": 70.0, "mean": 70.0, "count": 1},
                "primary_metric": {"min": 82.5, "max": 82.5, "mean": 82.5, "count": 1},
            },
            "best_run": {
                "metrics": {
                    "CondA/0/metric": 80.0,
                    "CondA/1/metric": 85.0,
                    "CondB/0/metric": 70.0,
                    "primary_metric": 82.5,
                    "primary_metric_std": 3.5355,
                    "total_elapsed_seconds": 1500.0,
                },
            },
            "condition_summaries": {
                "CondA": {"metrics": {"metric": 82.5}},
                "CondB": {"metrics": {"metric": 70.0}},
            },
            "condition_metrics": {
                "CondA": {"metrics": {"metric": 82.5}},
                "CondB": {"metrics": {"metric": 70.0}},
            },
            "total_conditions": 2,
        }

    def test_conditions_extracted(self):
        # Both condition names from the summary must appear in the registry.
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        assert "CondA" in reg.condition_names
        assert "CondB" in reg.condition_names
        assert len(reg.condition_names) == 2

    def test_per_seed_values(self):
        # "Cond/seed/metric" keys are parsed into per-condition per-seed dicts.
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        assert reg.conditions["CondA"].per_seed_values == {0: 80.0, 1: 85.0}
        assert reg.conditions["CondB"].per_seed_values == {0: 70.0}

    def test_condition_stats(self):
        # Aggregate stats are derived from the per-seed values above.
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        cond_a = reg.conditions["CondA"]
        assert cond_a.n_seeds == 2
        assert cond_a.mean == pytest.approx(82.5)
        assert cond_a.std == pytest.approx(3.5355, rel=0.01)

    def test_primary_metric(self):
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        assert reg.primary_metric == pytest.approx(82.5)
        assert reg.primary_metric_std == pytest.approx(3.5355)

    def test_all_values_registered(self):
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        # Core values must be verified
        assert reg.is_verified(80.0)
        assert reg.is_verified(85.0)
        assert reg.is_verified(70.0)
        assert reg.is_verified(82.5)
        assert reg.is_verified(3.5355, tolerance=0.01)

    def test_pairwise_differences(self):
        # Differences between condition means verify too (per the assertions
        # here), so "improvement of X points" claims can be checked.
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        diff = 82.5 - 70.0  # CondA.mean - CondB.mean
        assert reg.is_verified(diff)
        assert reg.is_verified(abs(diff))

    def test_fabricated_number_rejected(self):
        # Numbers that never occurred in the experiment must not verify.
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        assert not reg.is_verified(99.99)
        assert not reg.is_verified(60.51)

    def test_infra_keys_excluded(self):
        reg = VerifiedRegistry.from_experiment(self._make_summary())
        # total_elapsed_seconds goes to training_config, not values
        assert 1500.0 not in reg.values
        assert reg.training_config.get("total_elapsed_seconds") == 1500.0

    def test_with_refinement_log(self):
        # A refinement log alongside the summary must not break registration
        # of the summary's own values.
        summary = self._make_summary()
        ref_log = {
            "best_metric": 82.5,
            "best_version": "experiment_v1/",
            "iterations": [
                {
                    "version_dir": "experiment_v1/",
                    "metric": 82.5,
                    "sandbox": {"metrics": {"CondA/0/metric": 80.0}},
                }
            ],
        }
        reg = VerifiedRegistry.from_experiment(summary, ref_log)
        assert reg.is_verified(82.5)
# ---------------------------------------------------------------------------
# Integration tests — real artifact data
# ---------------------------------------------------------------------------
class TestRealArtifacts:
    """Test against actual pipeline output. Skipped if artifacts not present.

    NOTE(review): the literal metric values below are pinned to specific
    stored artifact runs; if those artifacts are regenerated, these
    constants must be re-checked against the new summaries.
    """

    def test_run_e57360_rl_exploration(self):
        """Run 38 (RL LACE) — 3 conditions, CartPole + Acrobot."""
        summary = _load_experiment_summary("e57360")
        ref_log = _load_refinement_log("e57360")
        reg = VerifiedRegistry.from_experiment(summary, ref_log)
        # Conditions that actually ran
        assert reg.verify_condition("DQN")
        assert reg.verify_condition("DQN+Abstraction")
        assert reg.verify_condition("DQN+RawCount")
        # Conditions that did NOT run (paper fabricated these)
        assert not reg.verify_condition("PPO")
        assert not reg.verify_condition("PPO+Abstraction")
        assert not reg.verify_condition("DQN+Autoencoder")
        # Real primary metric
        assert reg.is_verified(278.9333)
        assert reg.is_verified(146.4139, tolerance=0.01)
        # Fabricated number from paper (0.0 primary metric) — should NOT verify
        # unless 0.0 happens to be in the data for another reason
        # The paper claimed primary_metric=0.0 which is fabricated
        assert reg.primary_metric == pytest.approx(278.9333)

    def test_run_acbdfa_cnn_vs_ssm(self):
        """Run acbdfa (CTS) — ResNet vs S4D on CIFAR-100."""
        summary = _load_experiment_summary("acbdfa")
        reg = VerifiedRegistry.from_experiment(summary)
        # Real values from experiment
        assert reg.is_verified(69.99)
        assert reg.is_verified(69.93)
        assert reg.is_verified(58.66)
        assert reg.is_verified(2.75)
        # Primary metric
        assert reg.is_verified(66.1933, tolerance=0.01)

    def test_run_85fefc_contrastive_kd(self):
        """Run 85fefc (CRAFT) — contrastive KD."""
        summary = _load_experiment_summary("85fefc")
        ref_log = _load_refinement_log("85fefc")
        reg = VerifiedRegistry.from_experiment(summary, ref_log)
        # Should have conditions
        assert len(reg.condition_names) > 0
        # Primary metric should be registered
        assert reg.primary_metric is not None

    def test_run_8b4a1b_gard_lora(self):
        """Run 8b4a1b (GARD) — experiment failed, very few values."""
        summary = _load_experiment_summary("8b4a1b")
        reg = VerifiedRegistry.from_experiment(summary)
        # With empty metrics, registry should be sparse
        best_metrics = summary.get("best_run", {}).get("metrics", {})
        if not best_metrics:
            assert len(reg.values) == 0
# ---------------------------------------------------------------------------
# Unit tests — from_run_dir (merges multiple sources)
# ---------------------------------------------------------------------------
class TestFromRunDir:
    """VerifiedRegistry.from_run_dir: merging of summary files found on disk."""

    def _write_summary(self, path: Path, data: dict) -> None:
        # Helper: serialize a summary dict to disk, creating parent dirs.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(data, indent=2), encoding="utf-8")

    def test_from_run_dir_merges_multiple_stage14(self, tmp_path: Path) -> None:
        """Two stage-14 dirs with different values → both present."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        # Stage-14 with CondA
        self._write_summary(
            run_dir / "stage-14" / "experiment_summary.json",
            {
                "best_run": {"metrics": {"CondA/0/metric": 80.0}},
                "condition_summaries": {"CondA": {"metrics": {"metric": 80.0}}},
                "metrics_summary": {},
            },
        )
        # Stage-14-v2 with CondB
        self._write_summary(
            run_dir / "stage-14-v2" / "experiment_summary.json",
            {
                "best_run": {"metrics": {"CondB/0/metric": 90.0}},
                "condition_summaries": {"CondB": {"metrics": {"metric": 90.0}}},
                "metrics_summary": {},
            },
        )
        # Default mode merges both stage dirs into one registry.
        reg = VerifiedRegistry.from_run_dir(run_dir)
        assert "CondA" in reg.condition_names
        assert "CondB" in reg.condition_names
        assert reg.is_verified(80.0)
        assert reg.is_verified(90.0)

    def test_from_run_dir_includes_best(self, tmp_path: Path) -> None:
        """experiment_summary_best.json values merged."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        # Only best summary at root level
        self._write_summary(
            run_dir / "experiment_summary_best.json",
            {
                "best_run": {"metrics": {"primary_metric": 0.95}},
                "condition_summaries": {"Proposed": {"metrics": {"acc": 0.95}}},
                "metrics_summary": {"acc": {"mean": 0.95, "min": 0.95, "max": 0.95}},
            },
        )
        reg = VerifiedRegistry.from_run_dir(run_dir)
        assert reg.is_verified(0.95)
        assert reg.is_verified(95.0)  # percentage variant
        assert "Proposed" in reg.condition_names

    def test_from_run_dir_empty_dir(self, tmp_path: Path) -> None:
        """Empty run dir → empty registry, no crash."""
        run_dir = tmp_path / "empty_run"
        run_dir.mkdir()
        reg = VerifiedRegistry.from_run_dir(run_dir)
        assert len(reg.values) == 0
        assert len(reg.condition_names) == 0

    # -----------------------------------------------------------------------
    # BUG-222: best_only mode — REFINE bypass prevention
    # -----------------------------------------------------------------------
    def test_best_only_uses_experiment_summary_best(self, tmp_path: Path) -> None:
        """best_only=True should use ONLY experiment_summary_best.json."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        # v1 (best): FeatureKD 74.52%
        self._write_summary(
            run_dir / "experiment_summary_best.json",
            {
                "best_run": {"metrics": {"FeatureKD/0/metric": 0.7452}},
                "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.7452}}},
                "metrics_summary": {"metric": {"mean": 0.7452}},
            },
        )
        # v3 (regressed): FeatureKD 69.30%
        self._write_summary(
            run_dir / "stage-14" / "experiment_summary.json",
            {
                "best_run": {"metrics": {"FeatureKD/0/metric": 0.6930}},
                "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.6930}}},
                "metrics_summary": {"metric": {"mean": 0.6930}},
            },
        )
        reg = VerifiedRegistry.from_run_dir(run_dir, best_only=True)
        # Should ONLY have v1 (best) data
        assert reg.is_verified(0.7452)
        assert reg.is_verified(74.52)  # percentage variant
        # Should NOT have v3 (regressed) data
        assert not reg.is_verified(0.6930)
        assert not reg.is_verified(69.30)

    def test_best_only_excludes_refinement_log(self, tmp_path: Path) -> None:
        """best_only=True should NOT merge refinement_log.json sandbox data."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        # Best summary
        self._write_summary(
            run_dir / "experiment_summary_best.json",
            {
                "best_run": {"metrics": {"primary_metric": 0.7452}},
                "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.7452}}},
                "metrics_summary": {"metric": {"mean": 0.7452}},
            },
        )
        # Refinement log with sandbox metrics from regressed iteration
        rl_dir = run_dir / "stage-13"
        rl_dir.mkdir(parents=True)
        (rl_dir / "refinement_log.json").write_text(json.dumps({
            "iterations": [
                {"sandbox": {"metrics": {"primary_metric": 0.6930, "best_metric": 0.6930}}}
            ]
        }), encoding="utf-8")
        reg = VerifiedRegistry.from_run_dir(run_dir, best_only=True)
        assert reg.is_verified(0.7452)
        assert not reg.is_verified(0.6930), "Refinement log sandbox values should NOT be in best_only registry"

    def test_best_only_falls_back_to_stage14(self, tmp_path: Path) -> None:
        """best_only=True without best.json falls back to stage-14/ (non-versioned)."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        self._write_summary(
            run_dir / "stage-14" / "experiment_summary.json",
            {
                "best_run": {"metrics": {"metric": 0.85}},
                "condition_summaries": {"Baseline": {"metrics": {"metric": 0.85}}},
                "metrics_summary": {"metric": {"mean": 0.85}},
            },
        )
        reg = VerifiedRegistry.from_run_dir(run_dir, best_only=True)
        assert reg.is_verified(0.85)
        assert "Baseline" in reg.condition_names

    def test_default_mode_still_merges_all(self, tmp_path: Path) -> None:
        """Default (best_only=False) preserves backward-compat merging."""
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        self._write_summary(
            run_dir / "experiment_summary_best.json",
            {
                "best_run": {"metrics": {"FeatureKD/0/metric": 0.7452}},
                "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.7452}}},
                "metrics_summary": {},
            },
        )
        self._write_summary(
            run_dir / "stage-14" / "experiment_summary.json",
            {
                "best_run": {"metrics": {"FeatureKD/0/metric": 0.6930}},
                "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.6930}}},
                "metrics_summary": {},
            },
        )
        reg = VerifiedRegistry.from_run_dir(run_dir, best_only=False)
        # Both should be present in non-best_only mode
        assert reg.is_verified(0.7452)
        assert reg.is_verified(0.6930)
================================================
FILE: tests/test_web_crawler.py
================================================
"""Tests for researchclaw.web.crawler — WebCrawler."""
from __future__ import annotations
import asyncio
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.web.crawler import CrawlResult, WebCrawler
from researchclaw.web import check_url_ssrf
# ---------------------------------------------------------------------------
# CrawlResult dataclass
# ---------------------------------------------------------------------------
class TestCrawlResult:
    """has_content: per these cases, success alone is not enough — the
    markdown must be non-trivially long."""

    def test_has_content_true(self):
        result = CrawlResult(url="https://example.com", markdown="x" * 100, success=True)
        assert result.has_content

    def test_has_content_false_empty(self):
        result = CrawlResult(url="https://example.com", markdown="", success=True)
        assert not result.has_content

    def test_has_content_false_short(self):
        result = CrawlResult(url="https://example.com", markdown="too short", success=True)
        assert not result.has_content
# ---------------------------------------------------------------------------
# HTML → Markdown conversion (urllib fallback)
# ---------------------------------------------------------------------------
class TestHtmlToMarkdown:
    """Unit tests for WebCrawler._html_to_markdown (urllib fallback path).

    NOTE(review): the HTML fixture literals in this class were corrupted by a
    text-extraction pass that stripped all markup tags (leaving broken
    multi-line strings). They are reconstructed here from each test's
    assertions — confirm against the original file.
    """

    def test_strips_script_tags(self):
        # <script> bodies must not leak into the markdown output.
        html = "<html><body><script>alert('x')</script><p>Hello</p><p>World</p></body></html>"
        md = WebCrawler._html_to_markdown(html)
        assert "alert" not in md
        assert "Hello" in md
        assert "World" in md

    def test_converts_headings(self):
        html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3>"
        md = WebCrawler._html_to_markdown(html)
        assert "# Title" in md
        assert "## Subtitle" in md
        assert "### Section" in md

    def test_converts_paragraphs(self):
        html = "<p>First paragraph.</p><p>Second paragraph.</p>"
        md = WebCrawler._html_to_markdown(html)
        assert "First paragraph." in md
        assert "Second paragraph." in md

    def test_converts_links(self):
        html = '<a href="https://example.com">Click</a>'
        md = WebCrawler._html_to_markdown(html)
        assert "[Click](https://example.com)" in md

    def test_converts_list_items(self):
        html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
        md = WebCrawler._html_to_markdown(html)
        assert "- Item 1" in md
        assert "- Item 2" in md

    def test_decodes_entities(self):
        html = "<p>A &amp; B &lt; C &gt; D</p>"
        md = WebCrawler._html_to_markdown(html)
        assert "A & B < C > D" in md

    def test_collapses_whitespace(self):
        # Runs of blank lines must be collapsed to at most one blank line.
        html = "<p>Hello</p>\n\n\n\n\n<p>World</p>"
        md = WebCrawler._html_to_markdown(html)
        assert "\n\n\n" not in md
# ---------------------------------------------------------------------------
# urllib fallback crawl
# ---------------------------------------------------------------------------
class TestCrawlUrllibFallback:
    """urllib fallback path of WebCrawler.

    NOTE(review): the HTML fixture literals here were mangled by a
    text-extraction pass that stripped markup tags; they are reconstructed
    from each test's assertions — confirm against the original file.
    """

    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_urllib_success(self, mock_urlopen):
        mock_resp = MagicMock()
        # Title must parse out as "Test" and the body text must survive.
        mock_resp.read.return_value = (
            b"<html><head><title>Test</title></head>"
            b"<body><p>Content here</p></body></html>"
        )
        mock_resp.headers = {"Content-Type": "text/html; charset=utf-8"}
        mock_urlopen.return_value = mock_resp
        crawler = WebCrawler()
        import time
        t0 = time.monotonic()
        result = crawler._crawl_with_urllib("https://example.com", t0)
        assert result.success
        assert result.title == "Test"
        assert "Content here" in result.markdown

    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_urllib_truncation(self, mock_urlopen):
        mock_resp = MagicMock()
        # 60k characters of body text against a 1000-char content cap.
        long_content = "<html><body><p>" + "x" * 60000 + "</p></body></html>"
        mock_resp.read.return_value = long_content.encode()
        mock_resp.headers = {"Content-Type": "text/html"}
        mock_urlopen.return_value = mock_resp
        crawler = WebCrawler(max_content_length=1000)
        import time
        t0 = time.monotonic()
        result = crawler._crawl_with_urllib("https://example.com", t0)
        assert len(result.markdown) <= 1100  # 1000 + truncation notice
# ---------------------------------------------------------------------------
# Sync crawl (goes through crawl4ai → urllib fallback chain)
# ---------------------------------------------------------------------------
class TestCrawlSync:
    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_sync_falls_back_to_urllib(self, mock_urlopen):
        """crawl_sync tries crawl4ai, then falls back to urllib."""
        # NOTE(review): the HTML fixture literal was mangled by text
        # extraction (tags stripped); reconstructed — confirm against the
        # original file.
        mock_resp = MagicMock()
        mock_resp.read.return_value = (
            b"<html><head><title>Sync</title></head>"
            b"<body><p>Works via urllib</p></body></html>"
        )
        mock_resp.headers = {"Content-Type": "text/html"}
        mock_urlopen.return_value = mock_resp
        crawler = WebCrawler()
        # Crawl4AI may or may not work in test env (no browser),
        # but urllib fallback should always work
        result = crawler.crawl_sync("https://example.com")
        assert result.success or result.error  # either crawl4ai or urllib
# ---------------------------------------------------------------------------
# Async crawl
# ---------------------------------------------------------------------------
class TestCrawlAsync:
    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_async_urllib_fallback(self, mock_urlopen):
        """When crawl4ai's browser isn't set up, async crawl falls back to urllib."""
        # NOTE(review): the HTML fixture literal was mangled by text
        # extraction (tags stripped); reconstructed — confirm against the
        # original file.
        mock_resp = MagicMock()
        mock_resp.read.return_value = (
            b"<html><head><title>Async</title></head>"
            b"<body><p>Works</p></body></html>"
        )
        mock_resp.headers = {"Content-Type": "text/html"}
        mock_urlopen.return_value = mock_resp
        crawler = WebCrawler()
        result = asyncio.run(crawler.crawl("https://example.com"))
        # Should succeed via either crawl4ai or urllib fallback
        assert isinstance(result, CrawlResult)
# ---------------------------------------------------------------------------
# SSRF validation: check_url_ssrf
# ---------------------------------------------------------------------------
class TestCheckUrlSsrf:
    """check_url_ssrf: None for allowed URLs, an error string for blocked ones."""

    def test_http_allowed(self):
        assert check_url_ssrf("http://example.com") is None

    def test_https_allowed(self):
        assert check_url_ssrf("https://arxiv.org/abs/2301.00001") is None

    def test_rejects_file_scheme(self):
        error = check_url_ssrf("file:///etc/passwd")
        assert error is not None
        assert "scheme" in error.lower()

    def test_rejects_ftp_scheme(self):
        assert check_url_ssrf("ftp://server/file") is not None

    def test_rejects_localhost(self):
        error = check_url_ssrf("http://localhost:8080")
        assert error is not None
        lowered = error.lower()
        assert any(word in lowered for word in ("internal", "private", "blocked"))

    def test_rejects_127(self):
        # Loopback address with a service port.
        assert check_url_ssrf("http://127.0.0.1:6379") is not None

    def test_rejects_10_range(self):
        assert check_url_ssrf("http://10.0.0.1") is not None

    def test_rejects_172_range(self):
        assert check_url_ssrf("http://172.16.0.1") is not None

    def test_rejects_192_range(self):
        assert check_url_ssrf("http://192.168.1.1") is not None

    def test_rejects_aws_metadata(self):
        # Cloud metadata endpoint — the classic SSRF target.
        assert check_url_ssrf("http://169.254.169.254/latest/meta-data") is not None

    def test_rejects_empty_hostname(self):
        assert check_url_ssrf("http://") is not None
# ---------------------------------------------------------------------------
# Crawler SSRF integration
# ---------------------------------------------------------------------------
class TestCrawlerSsrfIntegration:
    """The crawler must short-circuit SSRF-blocked URLs before any network call."""

    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_sync_rejects_private_url(self, mock_urlopen):
        result = WebCrawler().crawl_sync("http://127.0.0.1:8080")
        assert not result.success
        assert result.error
        # urlopen must never be reached for a blocked address.
        mock_urlopen.assert_not_called()

    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_sync_rejects_file_scheme(self, mock_urlopen):
        result = WebCrawler().crawl_sync("file:///etc/passwd")
        assert not result.success
        assert "scheme" in result.error.lower()
        mock_urlopen.assert_not_called()

    @patch("researchclaw.web.crawler.urlopen")
    def test_crawl_async_rejects_private_url(self, mock_urlopen):
        result = asyncio.run(WebCrawler().crawl("http://10.0.0.1:9200"))
        assert not result.success
        assert result.error
        mock_urlopen.assert_not_called()
================================================
FILE: tests/test_web_integration.py
================================================
"""Integration tests for researchclaw.web — WebSearchAgent end-to-end."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.web.agent import WebSearchAgent, WebSearchAgentResult
from researchclaw.web.crawler import CrawlResult
from researchclaw.web.search import SearchResult, WebSearchResponse
from researchclaw.web.scholar import ScholarPaper
# ---------------------------------------------------------------------------
# WebSearchAgentResult
# ---------------------------------------------------------------------------
class TestWebSearchAgentResult:
    """Aggregation and context-string formatting of WebSearchAgentResult."""

    def test_total_results(self):
        result = WebSearchAgentResult(
            topic="test",
            web_results=[SearchResult(title="A", url="u1")],
            scholar_papers=[ScholarPaper(title="B")],
        )
        # One web hit plus one scholar paper.
        assert result.total_results == 2

    def test_to_context_string_empty(self):
        context = WebSearchAgentResult(topic="test").to_context_string()
        assert isinstance(context, str)

    def test_to_context_string_with_results(self):
        hit = SearchResult(
            title="KD Survey",
            url="https://example.com/kd",
            snippet="A comprehensive survey on KD",
            source="tavily",
        )
        paper = ScholarPaper(
            title="Distilling Knowledge",
            authors=["Hinton", "Vinyals", "Dean"],
            year=2015,
            citation_count=5000,
            abstract="We propose a technique for model compression.",
        )
        result = WebSearchAgentResult(
            topic="knowledge distillation",
            web_results=[hit],
            scholar_papers=[paper],
            search_answer="KD is a model compression technique.",
        )
        context = result.to_context_string()
        # Every section of the result should surface in the rendered context.
        for fragment in ("AI Search Summary", "KD Survey", "Distilling Knowledge", "Hinton"):
            assert fragment in context

    def test_to_context_string_truncation(self):
        hits = [
            SearchResult(title=f"R{i}", url=f"u{i}", snippet="x" * 1000)
            for i in range(50)
        ]
        result = WebSearchAgentResult(topic="test", web_results=hits)
        context = result.to_context_string(max_length=5000)
        # A small overshoot beyond max_length is tolerated.
        assert len(context) <= 5100

    def test_to_dict(self):
        result = WebSearchAgentResult(
            topic="test",
            web_results=[SearchResult(title="A", url="u1")],
        )
        payload = result.to_dict()
        assert payload["topic"] == "test"
        assert payload["web_results_count"] == 1

    def test_to_context_with_crawled_pages(self):
        page = CrawlResult(
            url="https://blog.example.com",
            markdown="# Great Blog Post\n\nContent " * 50,
            title="Great Blog Post",
            success=True,
        )
        result = WebSearchAgentResult(topic="test", crawled_pages=[page])
        context = result.to_context_string()
        assert "Crawled Page Content" in context
        assert "Great Blog Post" in context
# ---------------------------------------------------------------------------
# WebSearchAgent — orchestration
# ---------------------------------------------------------------------------
class TestWebSearchAgent:
    """Orchestration-level tests for WebSearchAgent; all network access is mocked."""

    def test_generate_queries(self):
        queries = WebSearchAgent._generate_queries("knowledge distillation")
        assert len(queries) == 3
        assert "knowledge distillation" in queries
        assert any("survey" in q for q in queries)
        assert any("benchmark" in q for q in queries)

    def test_select_urls_to_crawl(self):
        agent = WebSearchAgent(max_crawl_urls=3)
        result = WebSearchAgentResult(
            topic="test",
            web_results=[
                SearchResult(title=f"R{i}", url=f"https://ex.com/{i}")
                for i in range(10)
            ],
        )
        urls = agent._select_urls_to_crawl(result)
        # Selection honors the max_crawl_urls cap and only keeps HTTPS URLs here.
        assert len(urls) <= 3
        assert all(url.startswith("https://") for url in urls)

    def test_select_urls_skips_pdf(self):
        agent = WebSearchAgent(max_crawl_urls=5)
        result = WebSearchAgentResult(
            topic="test",
            web_results=[
                SearchResult(title="Paper", url="https://ex.com/paper.pdf"),
                SearchResult(title="Blog", url="https://ex.com/blog"),
            ],
        )
        urls = agent._select_urls_to_crawl(result)
        # PDFs go to the PDF extractor path, not the HTML crawler.
        assert "https://ex.com/paper.pdf" not in urls
        assert "https://ex.com/blog" in urls

    def test_find_pdf_urls(self):
        result = WebSearchAgentResult(
            topic="test",
            web_results=[
                SearchResult(title="P1", url="https://ex.com/a.pdf"),
                SearchResult(title="P2", url="https://ex.com/b.html"),
                SearchResult(title="P3", url="https://ex.com/c.pdf"),
            ],
        )
        pdfs = WebSearchAgent._find_pdf_urls(result)
        assert len(pdfs) == 2
        assert all(u.endswith(".pdf") for u in pdfs)

    @patch("researchclaw.web.search.urlopen")
    @patch("researchclaw.web.scholar.scholarly")
    def test_search_and_extract_minimal(self, mock_scholarly, mock_urlopen):
        """End-to-end test with mocked HTTP — DuckDuckGo + mocked Scholar."""
        mock_resp = MagicMock()
        # Minimal HTML payload; the assertions below do not depend on parsed results.
        mock_resp.read.return_value = (
            b"<html><body>"
            b"<a>Paper About KD</a>"
            b"<p>A study on knowledge distillation</p>"
            b"</body></html>"
        )
        mock_urlopen.return_value = mock_resp
        # Mock scholarly to return empty (avoid network calls)
        mock_scholarly.search_pubs.return_value = iter([])
        agent = WebSearchAgent(
            enable_scholar=True,
            enable_crawling=False,
            enable_pdf=False,
        )
        result = agent.search_and_extract("knowledge distillation")
        assert result.topic == "knowledge distillation"
        assert result.elapsed_seconds > 0

    @patch("researchclaw.web.search.urlopen")
    @patch("researchclaw.web.scholar.scholarly")
    @patch("researchclaw.web.crawler.urlopen")
    def test_search_and_extract_with_crawling(
        self, mock_crawl_urlopen, mock_scholarly, mock_search_urlopen
    ):
        """Test with crawling enabled."""
        mock_search_resp = MagicMock()
        mock_search_resp.read.return_value = (
            b"<html><body><a>KD Tutorial</a><p>A tutorial</p></body></html>"
        )
        mock_search_urlopen.return_value = mock_search_resp
        mock_crawl_resp = MagicMock()
        # BUGFIX: the previous bytes literal was split across physical lines and
        # did not parse; rebuild it as a single well-formed HTML payload.
        mock_crawl_resp.read.return_value = (
            b"<html><head><title>KD Tutorial</title></head><body><p>"
            + b"Tutorial content about knowledge distillation. " * 20
            + b"</p></body></html>"
        )
        mock_crawl_resp.headers = {"Content-Type": "text/html"}
        mock_crawl_urlopen.return_value = mock_crawl_resp
        mock_scholarly.search_pubs.return_value = iter([])
        agent = WebSearchAgent(
            enable_scholar=False,
            enable_crawling=True,
            enable_pdf=False,
            max_crawl_urls=2,
        )
        result = agent.search_and_extract("knowledge distillation")
        assert result.elapsed_seconds > 0
# ---------------------------------------------------------------------------
# Config integration
# ---------------------------------------------------------------------------
class TestWebSearchConfig:
    """WebSearchConfig defaults and its registration as an RCConfig field."""

    def test_default_config(self):
        from researchclaw.config import WebSearchConfig

        defaults = WebSearchConfig()
        assert defaults.enabled is True
        assert defaults.max_web_results == 10
        assert defaults.enable_scholar is True

    def test_config_in_rcconfig(self):
        import dataclasses

        from researchclaw.config import RCConfig

        names = {field.name for field in dataclasses.fields(RCConfig)}
        assert "web_search" in names
================================================
FILE: tests/test_web_pdf_extractor.py
================================================
"""Tests for researchclaw.web.pdf_extractor — PDFExtractor."""
from __future__ import annotations
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.web.pdf_extractor import PDFContent, PDFExtractor
# ---------------------------------------------------------------------------
# PDFContent dataclass
# ---------------------------------------------------------------------------
class TestPDFContent:
    """has_content should require a minimum amount of extracted text."""

    def test_has_content_true(self):
        content = PDFContent(path="test.pdf", text="x" * 200, success=True)
        assert content.has_content

    def test_has_content_false_empty(self):
        content = PDFContent(path="test.pdf", text="", success=True)
        assert not content.has_content

    def test_has_content_false_short(self):
        content = PDFContent(path="test.pdf", text="short", success=True)
        assert not content.has_content
# ---------------------------------------------------------------------------
# PDFExtractor
# ---------------------------------------------------------------------------
class TestPDFExtractor:
    """PDFExtractor backend selection and text-analysis heuristics."""

    def test_backend_detection(self):
        assert PDFExtractor().backend == "pymupdf"  # PyMuPDF is now installed

    def test_extract_nonexistent_file(self, tmp_path):
        outcome = PDFExtractor().extract(tmp_path / "does_not_exist.pdf")
        assert not outcome.success or "not found" in outcome.error.lower() or outcome.error

    def test_extract_abstract_pattern(self):
        text = """
Some header text
Abstract
This paper presents a novel approach to knowledge distillation
that achieves state-of-the-art results on ImageNet.
1 Introduction
We begin by motivating our approach...
"""
        assert "knowledge distillation" in PDFExtractor._extract_abstract(text)

    def test_extract_abstract_no_match(self):
        extracted = PDFExtractor._extract_abstract("No abstract section here, just random text.")
        assert extracted == ""

    def test_detect_sections(self):
        text = """
1. Introduction
This is the introduction section with some content.
2. Related Work
This covers prior work in the field.
3. Method
Our proposed approach works as follows.
4. Experiments
We evaluate on several benchmarks.
"""
        sections = PDFExtractor._detect_sections(text)
        assert len(sections) >= 3
        headings = [entry["heading"] for entry in sections]
        assert any("Introduction" in heading for heading in headings)
        assert any("Related" in heading or "Method" in heading for heading in headings)

    def test_detect_sections_empty(self):
        assert PDFExtractor._detect_sections("No numbered sections here at all.") == []

    @patch("researchclaw.web.pdf_extractor.urlopen")
    def test_extract_from_url_failure(self, mock_urlopen):
        mock_urlopen.side_effect = Exception("404 Not Found")
        outcome = PDFExtractor().extract_from_url("https://example.com/paper.pdf")
        assert not outcome.success or outcome.error
================================================
FILE: tests/test_web_platform.py
================================================
"""Tests for Agent A — Web platform and user interface.
Covers: FastAPI routes, WebSocket, intents, dashboard collector, wizard, voice commands.
All tests run without external services (mocked LLM, mocked Whisper).
"""
from __future__ import annotations
import asyncio
import json
import os
import sys
import tempfile
import time
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
# ---------------------------------------------------------------------------
# Config tests
# ---------------------------------------------------------------------------
class TestServerConfig:
    """ServerConfig / DashboardConfig defaults, dict parsing, and RCConfig wiring."""

    def test_server_config_defaults(self) -> None:
        from researchclaw.config import ServerConfig

        defaults = ServerConfig()
        assert defaults.enabled is False
        assert defaults.host == "0.0.0.0"
        assert defaults.port == 8080
        assert defaults.cors_origins == ("*",)
        assert defaults.auth_token == ""
        assert defaults.voice_enabled is False

    def test_dashboard_config_defaults(self) -> None:
        from researchclaw.config import DashboardConfig

        defaults = DashboardConfig()
        assert defaults.enabled is True
        assert defaults.refresh_interval_sec == 5
        assert defaults.max_log_lines == 1000

    def test_parse_server_config(self) -> None:
        from researchclaw.config import _parse_server_config

        parsed = _parse_server_config(
            {"enabled": True, "host": "127.0.0.1", "port": 9090, "auth_token": "secret123"}
        )
        assert parsed.enabled is True
        assert parsed.host == "127.0.0.1"
        assert parsed.port == 9090
        assert parsed.auth_token == "secret123"

    def test_parse_server_config_empty(self) -> None:
        from researchclaw.config import _parse_server_config

        parsed = _parse_server_config({})
        assert parsed.enabled is False
        assert parsed.port == 8080

    def test_parse_dashboard_config(self) -> None:
        from researchclaw.config import _parse_dashboard_config

        parsed = _parse_dashboard_config({"refresh_interval_sec": 10, "max_log_lines": 500})
        assert parsed.refresh_interval_sec == 10
        assert parsed.max_log_lines == 500

    def test_rcconfig_has_server_and_dashboard(self) -> None:
        from researchclaw.config import DashboardConfig, RCConfig, ServerConfig

        # Smallest config dict RCConfig.from_dict accepts, plus the two new sections.
        raw = {
            "project": {"name": "test"},
            "research": {"topic": "test topic"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "console"},
            "knowledge_base": {"root": "knowledge"},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost",
                "api_key_env": "TEST_KEY",
            },
            "server": {"enabled": True, "port": 9999},
            "dashboard": {"refresh_interval_sec": 3},
        }
        parsed = RCConfig.from_dict(raw, check_paths=False)
        assert isinstance(parsed.server, ServerConfig)
        assert parsed.server.enabled is True
        assert parsed.server.port == 9999
        assert isinstance(parsed.dashboard, DashboardConfig)
        assert parsed.dashboard.refresh_interval_sec == 3
# ---------------------------------------------------------------------------
# CLI tests
# ---------------------------------------------------------------------------
class TestCLI:
    """Verify the new CLI subcommands are registered with the argument parser."""

    def _assert_help_exits_zero(self, subcommand: str) -> None:
        """argparse exits with code 0 when --help is requested for a known subcommand."""
        from researchclaw.cli import main

        with pytest.raises(SystemExit) as excinfo:
            main([subcommand, "--help"])
        assert excinfo.value.code == 0

    def test_serve_subcommand_exists(self) -> None:
        self._assert_help_exits_zero("serve")

    def test_dashboard_subcommand_exists(self) -> None:
        self._assert_help_exits_zero("dashboard")

    def test_wizard_subcommand_exists(self) -> None:
        self._assert_help_exits_zero("wizard")
# ---------------------------------------------------------------------------
# Intent classification tests
# ---------------------------------------------------------------------------
class TestIntents:
    """Intent classification over English and Chinese utterances."""

    def _assert_intent(self, message: str, expected_name: str) -> None:
        """Classify *message* and assert it maps to the named Intent member."""
        from researchclaw.server.dialog.intents import Intent, classify_intent

        intent, _ = classify_intent(message)
        assert intent == getattr(Intent, expected_name)

    def test_help_intent(self) -> None:
        self._assert_intent("help", "HELP")

    def test_status_intent(self) -> None:
        self._assert_intent("What stage are we at?", "CHECK_STATUS")

    def test_start_intent(self) -> None:
        self._assert_intent("Start the pipeline", "START_PIPELINE")

    def test_topic_intent(self) -> None:
        self._assert_intent("Help me find a research direction", "TOPIC_SELECTION")

    def test_results_intent(self) -> None:
        self._assert_intent("What are the results?", "DISCUSS_RESULTS")

    def test_config_intent(self) -> None:
        self._assert_intent("Change the learning rate to 0.001", "MODIFY_CONFIG")

    def test_paper_intent(self) -> None:
        self._assert_intent("Edit the abstract", "EDIT_PAPER")

    def test_general_intent(self) -> None:
        self._assert_intent("Hello there", "GENERAL_CHAT")

    def test_chinese_status(self) -> None:
        self._assert_intent("现在到哪一步了", "CHECK_STATUS")

    def test_chinese_start(self) -> None:
        self._assert_intent("开始跑实验", "START_PIPELINE")

    def test_empty_message(self) -> None:
        from researchclaw.server.dialog.intents import Intent, classify_intent

        intent, confidence = classify_intent("")
        # Empty input falls back to general chat with zero confidence.
        assert intent == Intent.GENERAL_CHAT
        assert confidence == 0.0
# ---------------------------------------------------------------------------
# Session management tests
# ---------------------------------------------------------------------------
class TestSession:
    """ChatSession / SessionManager lifecycle, history limits, persistence."""

    def test_session_create(self) -> None:
        from researchclaw.server.dialog.session import SessionManager

        chat = SessionManager().get_or_create("client1")
        assert chat.client_id == "client1"
        assert len(chat.history) == 0

    def test_session_add_message(self) -> None:
        from researchclaw.server.dialog.session import SessionManager

        chat = SessionManager().get_or_create("client1")
        chat.add_message("user", "Hello")
        chat.add_message("assistant", "Hi!")
        assert len(chat.history) == 2
        assert chat.history[0].role == "user"

    def test_session_context(self) -> None:
        from researchclaw.server.dialog.session import SessionManager

        chat = SessionManager().get_or_create("client1")
        for index in range(20):
            chat.add_message("user", f"msg {index}")
        # get_context returns only the requested tail of the history.
        assert len(chat.get_context(last_n=5)) == 5

    def test_session_max_history(self) -> None:
        from researchclaw.server.dialog.session import ChatSession

        chat = ChatSession(client_id="test")
        for index in range(100):
            chat.add_message("user", f"msg {index}")
        assert len(chat.history) <= chat.MAX_HISTORY

    def test_session_persistence(self) -> None:
        from researchclaw.server.dialog.session import SessionManager

        with tempfile.TemporaryDirectory() as tmpdir:
            writer = SessionManager(persist_dir=tmpdir)
            chat = writer.get_or_create("persist-test")
            chat.add_message("user", "saved message")
            writer.save("persist-test")
            # A fresh manager must be able to re-hydrate the session from disk.
            restored = SessionManager(persist_dir=tmpdir).load("persist-test")
            assert restored is not None
            assert len(restored.history) == 1
            assert restored.history[0].content == "saved message"
# ---------------------------------------------------------------------------
# Dashboard collector tests
# ---------------------------------------------------------------------------
class TestDashboardCollector:
    """DashboardCollector reads run state out of an artifacts/ directory tree."""

    def test_collect_empty_dir(self) -> None:
        from researchclaw.dashboard.collector import DashboardCollector

        with tempfile.TemporaryDirectory() as tmpdir:
            assert DashboardCollector(artifacts_dir=tmpdir).collect_all() == []

    def test_collect_run_with_checkpoint(self) -> None:
        from researchclaw.dashboard.collector import DashboardCollector

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir) / "rc-20260315-abc123"
            run_dir.mkdir()
            checkpoint = {"stage": 5, "stage_name": "LITERATURE_SCREEN", "status": "running"}
            (run_dir / "checkpoint.json").write_text(json.dumps(checkpoint))
            snapshots = DashboardCollector(artifacts_dir=tmpdir).collect_all()
            assert len(snapshots) == 1
            assert snapshots[0].current_stage == 5
            assert snapshots[0].current_stage_name == "LITERATURE_SCREEN"

    def test_collect_run_active_heartbeat(self) -> None:
        from researchclaw.dashboard.collector import DashboardCollector

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir) / "rc-20260315-test01"
            run_dir.mkdir()
            # A just-written heartbeat marks the run as active.
            (run_dir / "heartbeat.json").write_text(json.dumps({"timestamp": time.time()}))
            snapshots = DashboardCollector(artifacts_dir=tmpdir).collect_all()
            assert len(snapshots) == 1
            assert snapshots[0].is_active is True

    def test_collect_run_stale_heartbeat(self) -> None:
        from researchclaw.dashboard.collector import DashboardCollector

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir) / "rc-20260315-stale1"
            run_dir.mkdir()
            # A heartbeat two minutes old must be treated as stale.
            stale = {"timestamp": time.time() - 120}
            (run_dir / "heartbeat.json").write_text(json.dumps(stale))
            snapshots = DashboardCollector(artifacts_dir=tmpdir).collect_all()
            assert snapshots[0].is_active is False

    def test_collect_stage_directories(self) -> None:
        from researchclaw.dashboard.collector import DashboardCollector

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir) / "rc-20260315-stages"
            run_dir.mkdir()
            for stage_name in ("stage-01", "stage-02", "stage-03"):
                (run_dir / stage_name).mkdir()
            snapshots = DashboardCollector(artifacts_dir=tmpdir).collect_all()
            assert len(snapshots[0].stages_completed) == 3

    def test_collect_metrics(self) -> None:
        from researchclaw.dashboard.collector import DashboardCollector

        with tempfile.TemporaryDirectory() as tmpdir:
            run_dir = Path(tmpdir) / "rc-20260315-metric"
            run_dir.mkdir()
            payload = {"accuracy": 0.85, "loss": 0.12}
            (run_dir / "results.json").write_text(json.dumps(payload))
            snapshots = DashboardCollector(artifacts_dir=tmpdir).collect_all()
            assert snapshots[0].metrics["accuracy"] == 0.85

    def test_snapshot_to_dict(self) -> None:
        from researchclaw.dashboard.collector import RunSnapshot

        serialized = RunSnapshot(run_id="test-1", path="/tmp/test").to_dict()
        assert serialized["run_id"] == "test-1"
        assert "current_stage" in serialized
# ---------------------------------------------------------------------------
# Metrics tests
# ---------------------------------------------------------------------------
class TestMetrics:
    """aggregate_metrics and extract_training_curve helpers."""

    def test_aggregate_empty(self) -> None:
        from researchclaw.dashboard.metrics import aggregate_metrics

        assert aggregate_metrics([])["total_runs"] == 0

    def test_aggregate_mixed(self) -> None:
        from researchclaw.dashboard.metrics import aggregate_metrics

        summary = aggregate_metrics([
            {"is_active": True, "status": "running", "current_stage": 10},
            {"is_active": False, "status": "completed", "current_stage": 23},
            {"is_active": False, "status": "failed", "current_stage": 5},
        ])
        assert summary["total_runs"] == 3
        assert summary["active_runs"] == 1
        assert summary["completed_runs"] == 1
        assert summary["failed_runs"] == 1

    def test_extract_training_curve(self) -> None:
        from researchclaw.dashboard.metrics import extract_training_curve

        curve = extract_training_curve({
            "training_log": [
                {"epoch": 1, "loss": 0.5, "accuracy": 0.7},
                {"epoch": 2, "loss": 0.3, "accuracy": 0.85},
            ]
        })
        assert len(curve) == 2
        assert curve[1]["loss"] == 0.3
# ---------------------------------------------------------------------------
# Voice command tests
# ---------------------------------------------------------------------------
class TestVoiceCommands:
    """parse_voice_input keyword matching, English and Chinese."""

    def _parsed(self, utterance):
        """Run the parser on one utterance and return the parse result."""
        from researchclaw.voice.commands import parse_voice_input

        return parse_voice_input(utterance)

    def test_start_command(self) -> None:
        from researchclaw.voice.commands import VoiceCommand

        assert self._parsed("start experiment").command == VoiceCommand.START

    def test_stop_command(self) -> None:
        from researchclaw.voice.commands import VoiceCommand

        assert self._parsed("stop").command == VoiceCommand.STOP

    def test_chinese_start(self) -> None:
        from researchclaw.voice.commands import VoiceCommand

        assert self._parsed("开始实验").command == VoiceCommand.START

    def test_chinese_pause(self) -> None:
        from researchclaw.voice.commands import VoiceCommand

        assert self._parsed("暂停").command == VoiceCommand.PAUSE

    def test_not_a_command(self) -> None:
        from researchclaw.voice.commands import VoiceCommand

        assert self._parsed("What about the neural network?").command == VoiceCommand.NONE

    def test_status_command(self) -> None:
        from researchclaw.voice.commands import VoiceCommand

        assert self._parsed("查看进度").command == VoiceCommand.STATUS
# ---------------------------------------------------------------------------
# Wizard tests
# ---------------------------------------------------------------------------
class TestWizard:
    """Wizard templates, web-mode answer handling, and environment detection."""

    def test_list_templates(self) -> None:
        from researchclaw.wizard.templates import list_templates

        available = list_templates()
        assert len(available) >= 3
        template_names = [entry["name"] for entry in available]
        assert "quick-demo" in template_names
        assert "standard-cv" in template_names

    def test_get_template(self) -> None:
        from researchclaw.wizard.templates import get_template

        template = get_template("quick-demo")
        assert template is not None
        assert template["experiment.mode"] == "simulated"

    def test_get_template_missing(self) -> None:
        from researchclaw.wizard.templates import get_template

        assert get_template("nonexistent") is None

    def test_wizard_web_mode(self) -> None:
        from researchclaw.wizard.quickstart import QuickStartWizard

        answers = [
            {"key": "project_name", "value": "test-proj"},
            {"key": "topic", "value": "neural scaling laws"},
            {"key": "mode", "value": "docker"},
        ]
        generated = QuickStartWizard().run_web(answers)
        assert generated.get("project", {}).get("name") == "test-proj"
        assert generated.get("research", {}).get("topic") == "neural scaling laws"

    def test_environment_detection(self) -> None:
        from researchclaw.wizard.validator import detect_environment

        report = detect_environment()
        # This test runs under Python, so the detector must at least see Python.
        assert report.has_python is True
        assert report.python_version != ""
        serialized = report.to_dict()
        assert "has_gpu" in serialized
        assert "recommendations" in serialized
# ---------------------------------------------------------------------------
# WebSocket events tests
# ---------------------------------------------------------------------------
class TestEvents:
    """Event (de)serialization for the WebSocket channel."""

    def test_event_serialization(self) -> None:
        from researchclaw.server.websocket.events import Event, EventType

        encoded = Event(type=EventType.STAGE_COMPLETE, data={"stage": 5}).to_json()
        decoded = json.loads(encoded)
        assert decoded["type"] == "stage_complete"
        assert decoded["data"]["stage"] == 5

    def test_event_deserialization(self) -> None:
        from researchclaw.server.websocket.events import Event, EventType

        wire = json.dumps(
            {"type": "heartbeat", "data": {"active_clients": 3}, "timestamp": 1234567890.0}
        )
        event = Event.from_json(wire)
        assert event.type == EventType.HEARTBEAT
        assert event.data["active_clients"] == 3

    def test_event_types_enum(self) -> None:
        from researchclaw.server.websocket.events import EventType

        assert EventType.CONNECTED.value == "connected"
        assert EventType.STAGE_START.value == "stage_start"
        assert EventType.CHAT_RESPONSE.value == "chat_response"
# ---------------------------------------------------------------------------
# Dialog router tests
# ---------------------------------------------------------------------------
class TestDialogRouter:
    """route_message async entry point of the dialog subsystem."""

    @pytest.mark.asyncio
    async def test_route_help_message(self) -> None:
        from researchclaw.server.dialog.router import route_message

        reply = await route_message("help", "test-client")
        assert "help" in reply.lower() or "I can" in reply

    @pytest.mark.asyncio
    async def test_route_json_message(self) -> None:
        from researchclaw.server.dialog.router import route_message

        envelope = json.dumps({"message": "help me"})
        reply = await route_message(envelope, "test-client-2")
        assert isinstance(reply, str)
        assert len(reply) > 0

    @pytest.mark.asyncio
    async def test_route_status_message(self) -> None:
        from researchclaw.server.dialog.router import route_message

        reply = await route_message("What's the current progress?", "test-client-3")
        assert isinstance(reply, str)
# ---------------------------------------------------------------------------
# FastAPI app tests (requires fastapi + httpx)
# ---------------------------------------------------------------------------
class TestFastAPIApp:
    """HTTP-level tests against the FastAPI app (skipped without fastapi/httpx)."""

    @pytest.fixture
    def _skip_if_no_fastapi(self) -> None:
        try:
            import fastapi  # noqa: F401
            import httpx  # noqa: F401
        except ImportError:
            pytest.skip("fastapi/httpx not installed")

    @pytest.fixture
    def app(self, _skip_if_no_fastapi: None) -> object:
        from researchclaw.config import RCConfig

        raw = {
            "project": {"name": "test"},
            "research": {"topic": "test"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "console"},
            "knowledge_base": {"root": "knowledge"},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost",
                "api_key_env": "TEST",
            },
        }
        config = RCConfig.from_dict(raw, check_paths=False)
        from researchclaw.server.app import create_app

        return create_app(config)

    async def _request(self, app: object, method: str, path: str):
        """Issue one in-process request against the ASGI app and return the response."""
        from httpx import ASGITransport, AsyncClient

        transport = ASGITransport(app=app)  # type: ignore[arg-type]
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            return await client.request(method, path)

    @pytest.mark.asyncio
    async def test_health_endpoint(self, app: object) -> None:
        resp = await self._request(app, "GET", "/api/health")
        assert resp.status_code == 200
        assert resp.json()["status"] == "ok"

    @pytest.mark.asyncio
    async def test_config_endpoint(self, app: object) -> None:
        resp = await self._request(app, "GET", "/api/config")
        assert resp.status_code == 200
        assert resp.json()["project"] == "test"

    @pytest.mark.asyncio
    async def test_pipeline_status_idle(self, app: object) -> None:
        resp = await self._request(app, "GET", "/api/pipeline/status")
        assert resp.status_code == 200
        assert resp.json()["status"] == "idle"

    @pytest.mark.asyncio
    async def test_pipeline_stages(self, app: object) -> None:
        resp = await self._request(app, "GET", "/api/pipeline/stages")
        assert resp.status_code == 200
        # The pipeline exposes all 23 stages.
        assert len(resp.json()["stages"]) == 23

    @pytest.mark.asyncio
    async def test_runs_list(self, app: object) -> None:
        resp = await self._request(app, "GET", "/api/runs")
        assert resp.status_code == 200
        assert "runs" in resp.json()

    @pytest.mark.asyncio
    async def test_projects_list(self, app: object) -> None:
        resp = await self._request(app, "GET", "/api/projects")
        assert resp.status_code == 200
        assert "projects" in resp.json()

    @pytest.mark.asyncio
    async def test_stop_pipeline_404_when_idle(self, app: object) -> None:
        resp = await self._request(app, "POST", "/api/pipeline/stop")
        assert resp.status_code == 404
================================================
FILE: tests/test_web_scholar.py
================================================
"""Tests for researchclaw.web.scholar — GoogleScholarClient."""
from __future__ import annotations
import time
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.web.scholar import GoogleScholarClient, ScholarPaper
# ---------------------------------------------------------------------------
# ScholarPaper dataclass
# ---------------------------------------------------------------------------
class TestScholarPaper:
    """ScholarPaper serialization and conversion to the literature model."""

    def test_to_dict(self):
        paper = ScholarPaper(
            title="Attention Is All You Need",
            authors=["Vaswani", "Shazeer"],
            year=2017,
            citation_count=50000,
        )
        serialized = paper.to_dict()
        assert serialized["title"] == "Attention Is All You Need"
        assert serialized["year"] == 2017
        assert serialized["source"] == "google_scholar"

    def test_to_literature_paper(self):
        paper = ScholarPaper(
            title="Test Paper",
            authors=["Author One", "Author Two"],
            year=2024,
            abstract="An abstract.",
            citation_count=100,
            url="https://example.com",
        )
        converted = paper.to_literature_paper()
        assert converted.title == "Test Paper"
        assert converted.source == "google_scholar"
        assert len(converted.authors) == 2
        assert converted.authors[0].name == "Author One"
# ---------------------------------------------------------------------------
# GoogleScholarClient
# ---------------------------------------------------------------------------
class TestGoogleScholarClient:
    """GoogleScholarClient publication parsing, rate limiting, and mocked searches."""

    @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True)
    def test_available_always_true(self):
        """scholarly is now an installed dependency, always available."""
        assert GoogleScholarClient().available

    def test_parse_pub_full(self):
        """_parse_pub with a fully populated publication dict."""
        raw_pub = {
            "bib": {
                "title": "Deep Learning",
                "author": ["LeCun", "Bengio", "Hinton"],
                "pub_year": "2015",
                "abstract": "Deep learning review.",
                "venue": "Nature",
            },
            "num_citations": 30000,
            "pub_url": "https://nature.com/dl",
            "cites_id": ["abc123"],
        }
        parsed = GoogleScholarClient._parse_pub(raw_pub)
        assert parsed.title == "Deep Learning"
        assert parsed.year == 2015
        assert parsed.citation_count == 30000
        assert "LeCun" in parsed.authors
        assert parsed.venue == "Nature"

    def test_parse_pub_string_authors(self):
        # scholarly sometimes returns authors as a single "A and B" string.
        raw_pub = {
            "bib": {
                "title": "Paper",
                "author": "Smith and Jones",
                "pub_year": "2023",
            },
            "num_citations": 10,
            "pub_url": "https://example.com",
        }
        parsed = GoogleScholarClient._parse_pub(raw_pub)
        assert parsed.title == "Paper"
        assert "Smith" in parsed.authors
        assert "Jones" in parsed.authors

    def test_parse_pub_missing_fields(self):
        parsed = GoogleScholarClient._parse_pub({"bib": {}, "num_citations": 0})
        assert parsed.title == ""
        assert parsed.year == 0
        assert parsed.authors == []

    @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True)
    def test_rate_limiting(self):
        client = GoogleScholarClient(inter_request_delay=0.01)
        started = time.monotonic()
        client._rate_limit()
        client._rate_limit()
        # Two consecutive calls must be separated by at least the configured delay.
        assert time.monotonic() - started >= 0.01

    @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True)
    @patch("researchclaw.web.scholar.scholarly")
    def test_search_with_mocked_scholarly(self, mock_scholarly):
        """search() should surface results produced by the scholarly iterator."""
        fake_pub = {
            "bib": {
                "title": "Test Paper",
                "author": ["Author A"],
                "pub_year": "2024",
            },
            "num_citations": 5,
            "pub_url": "https://example.com",
        }
        mock_scholarly.search_pubs.return_value = iter([fake_pub])
        papers = GoogleScholarClient(inter_request_delay=0.0).search("test query", limit=5)
        assert len(papers) == 1
        assert papers[0].title == "Test Paper"

    @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True)
    @patch("researchclaw.web.scholar.scholarly")
    def test_search_error_graceful(self, mock_scholarly):
        """Search should return empty list on error, not raise."""
        mock_scholarly.search_pubs.side_effect = Exception("Rate limited")
        assert GoogleScholarClient(inter_request_delay=0.0).search("test query") == []
================================================
FILE: tests/test_web_search.py
================================================
"""Tests for researchclaw.web.search — WebSearchClient."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from researchclaw.web.search import SearchResult, WebSearchClient, WebSearchResponse
# ---------------------------------------------------------------------------
# SearchResult dataclass
# ---------------------------------------------------------------------------
class TestSearchResult:
    """Behaviour of the SearchResult dataclass."""

    def test_to_dict(self):
        """to_dict should expose constructor fields under plain string keys."""
        result = SearchResult(
            title="Test", url="https://example.com", snippet="A snippet", source="tavily"
        )
        as_dict = result.to_dict()
        expectations = (
            ("title", "Test"),
            ("url", "https://example.com"),
            ("source", "tavily"),
        )
        for key, expected in expectations:
            assert as_dict[key] == expected
# ---------------------------------------------------------------------------
# WebSearchResponse dataclass
# ---------------------------------------------------------------------------
class TestWebSearchResponse:
    """Behaviour of the WebSearchResponse dataclass."""

    def test_has_results_true(self):
        """has_results is truthy once at least one SearchResult is attached."""
        response = WebSearchResponse(
            query="test", results=[SearchResult(title="A", url="u")],
        )
        assert response.has_results

    def test_has_results_false(self):
        """has_results is falsy for a response with no results."""
        empty_response = WebSearchResponse(query="test")
        assert not empty_response.has_results
# ---------------------------------------------------------------------------
# DuckDuckGo HTML parsing
# ---------------------------------------------------------------------------
class TestDDGParsing:
    """Tests for WebSearchClient._parse_ddg_html (DuckDuckGo HTML scraping).

    NOTE(review): the HTML fixtures below look truncated — the anchor/snippet
    markup that _parse_ddg_html would extract is missing from the strings,
    yet the assertions expect parsed results. Confirm these fixtures against
    the original test file before relying on them.
    """

    def test_parse_ddg_html_basic(self):
        # NOTE(review): fixture appears empty; expected two result anchors
        # ("Title One" / "Snippet one here") — markup presumably lost.
        html = """
"""
        results = WebSearchClient._parse_ddg_html(html, limit=10)
        assert len(results) == 2
        assert results[0].title == "Title One"
        assert results[0].url == "https://example.com/1"
        assert results[0].snippet == "Snippet one here"

    def test_parse_ddg_html_skips_ddg_links(self):
        # NOTE(review): fixture should contain one duckduckgo-internal link
        # ("DDG Link") and one external link ("Real") — markup presumably lost.
        html = """
DDG Link
Real
"""
        results = WebSearchClient._parse_ddg_html(html, limit=10)
        # Internal DDG navigation links must be filtered out of results.
        assert len(results) == 1
        assert results[0].url == "https://example.com/real"

    def test_parse_ddg_html_respects_limit(self):
        # Builds 20 candidate entries; parser must cap output at limit=5.
        # NOTE(review): the per-entry markup around 'T{i}' looks stripped.
        html = ""
        for i in range(20):
            html += f'T{i}\n'
        results = WebSearchClient._parse_ddg_html(html, limit=5)
        assert len(results) == 5
# ---------------------------------------------------------------------------
# WebSearchClient.search
# ---------------------------------------------------------------------------
class TestWebSearchClient:
    """Tests for WebSearchClient.search / _search_tavily / search_multi.

    NOTE(review): the byte-string HTML payloads below appear to have lost
    their markup during extraction; the DDG-path assertions may depend on
    fixtures that are no longer intact — confirm against the original file.
    """

    @patch("researchclaw.web.search.urlopen")
    def test_search_ddg_fallback_no_api_key(self, mock_urlopen):
        """When no API key is set, uses DuckDuckGo fallback."""
        mock_resp = MagicMock()
        # Fake HTTP response body returned by urlopen().read().
        mock_resp.read.return_value = b"""
Paper Title
About the paper
"""
        mock_urlopen.return_value = mock_resp
        client = WebSearchClient(api_key="")  # No API key
        response = client.search("test query")
        # Without a Tavily key the client must fall through to DDG scraping.
        assert response.source == "duckduckgo"

    @patch("researchclaw.web.search.urlopen")
    def test_search_ddg_error_graceful(self, mock_urlopen):
        # A network failure must yield an empty DDG response, not an exception.
        mock_urlopen.side_effect = Exception("Network error")
        client = WebSearchClient(api_key="")
        response = client.search("test query")
        assert response.source == "duckduckgo"
        assert len(response.results) == 0

    def test_search_tavily_with_mock(self):
        """Test Tavily search with mocked SDK."""
        mock_client_instance = MagicMock()
        # Shape mirrors the Tavily SDK's search() return payload.
        mock_client_instance.search.return_value = {
            "results": [
                {
                    "title": "Tavily Result",
                    "url": "https://tavily.com/r1",
                    "content": "Content from Tavily",
                    "score": 0.95,
                }
            ],
            "answer": "AI summary answer",
        }
        mock_tavily_module = MagicMock()
        mock_tavily_module.TavilyClient.return_value = mock_client_instance
        # Inject the fake tavily module so the lazy `import tavily` inside
        # the client resolves to our mock.
        with patch.dict("sys.modules", {"tavily": mock_tavily_module}):
            client = WebSearchClient(api_key="test-key")
            import time
            # NOTE(review): positional args presumed to be
            # (query, limit, include_domains, exclude_domains, start_time)
            # — confirm against researchclaw/web/search.py.
            response = client._search_tavily("test query", 10, None, None, time.monotonic())
        assert response.source == "tavily"
        assert len(response.results) == 1
        assert response.results[0].title == "Tavily Result"
        assert response.answer == "AI summary answer"

    @patch("researchclaw.web.search.urlopen")
    def test_search_multi_deduplication(self, mock_urlopen):
        # Both queries return the identical page, so any URL seen in the
        # first response must be absent from the second (deduplication).
        mock_resp = MagicMock()
        mock_resp.read.return_value = b"""
Same Result
"""
        mock_urlopen.return_value = mock_resp
        client = WebSearchClient(api_key="")
        responses = client.search_multi(["query1", "query2"], inter_query_delay=0.0)
        assert len(responses) == 2
        # Second query should have same URL deduped
        if responses[0].results:
            assert all(
                r.url != responses[0].results[0].url
                for r in responses[1].results
            )
================================================
FILE: website/features.html
================================================
Features — AutoResearchClaw
Features
Everything you need for autonomous research paper generation, built for reliability and quality.
Multi-Source Literature Search
Searches OpenAlex (primary, 10K/day), Semantic Scholar, and arXiv in parallel. Intelligent source fallback ensures results even when individual APIs are rate-limited.
Rate Limit Defense
Five-layer defense: adaptive rate limiter, three-state circuit breaker, multi-source fallback, intelligent caching with per-source TTL, and request optimization via S2 batch API.
Docker Sandbox with GPU
Experiments run in isolated Docker containers based on nvidia/cuda:12.4.1 with PyTorch, GPU passthrough, network sandboxing, and pre-cached datasets (CIFAR-10, FashionMNIST).
Hardware-Aware Design
Automatically detects available GPU memory and adjusts experiment parameters (batch size, model size, training epochs) to fit within hardware constraints.
Multi-Agent Peer Review
Simulated conference-style peer review with multiple LLM reviewer personas providing structured feedback on technical soundness, methodology, and clarity.
Pivot / Refine / Proceed
After analyzing experiment results, the pipeline autonomously decides whether to proceed with paper writing, refine the experiment, or pivot to a new hypothesis (max 2 pivots).
Experiment Charts
Automatically generates publication-quality comparison charts, filtering out timing/meta metrics. Supports bar charts, learning curves, and ablation visualizations.
Conference-Ready LaTeX
Outputs publication-quality LaTeX with proper bibliography, figure placement, and conference formatting (NeurIPS, ICLR, ICML templates).
Citation Verification
Every cited paper is verified against real academic databases (CrossRef, OpenAlex, arXiv, Semantic Scholar) in optimized order to minimize API pressure.
Result Caching
Per-source cache TTL: arXiv results cached 24h (daily metadata updates), S2/OpenAlex 3 days, citation verification results cached permanently.
Seminal Paper Library
Built-in seed library of foundational ML papers (normalization, ResNets, transformers, etc.) injected during literature search to ensure key references are cited.
Code Security Validation
Generated experiment code is validated for security (no network access, no subprocess calls, no file system writes outside workspace) before Docker execution.
Contradiction Detection
Automatically detects contradictions in experiment results: null findings, negative results, and cases where control outperforms proposed method.
Quality Assessment
Built-in quality scoring across novelty, soundness, significance, clarity, and reproducibility. Papers below threshold trigger rewriting.
Knowledge Archive
Research findings are archived in a persistent knowledge base (Markdown-backed) for cross-project knowledge transfer and future reference.
LLM Fine-Tuning
Optional QLoRA/LoRA fine-tuning support for adapting language models to specific research domains and writing styles.
How We Compare
AutoResearchClaw vs. other autonomous research tools
Feature
AutoResearchClaw
PaperClaw
Sibyl
Idea2Paper
Literature search
3 APIs + cache
2 APIs
arXiv only
Offline KG
Rate limit handling
Circuit breaker + fallback
Exponential backoff
None
N/A
Code execution
Docker + GPU
No
No
No
Peer review
Multi-agent
No
Single agent
No
Citation verification
4 API sources
No
No
No
Pipeline stages
23
~8
~5
~6
================================================
FILE: website/getting-started.html
================================================
Get Started — AutoResearchClaw
Get Started
From zero to your first autonomous research paper in minutes.
0 Prerequisites
- ☑ Python 3.10+
- ☑ Docker with NVIDIA Container Toolkit (for GPU experiments)
- ☑ An OpenAI-compatible API key (Azure OpenAI, OpenAI, or local LLM)
- ☑ NVIDIA GPU with 8GB+ VRAM (optional, for Docker sandbox)
1 Clone the Repository
git clone https://github.com/aiming-lab/AutoResearchClaw.git
cd AutoResearchClaw
2 Install Dependencies
pip install -e .
This installs the researchclaw package and all required dependencies.
3 Configure Your LLM
Create a YAML config file (e.g., config.yaml) with your LLM settings:
# config.yaml
project:
name: "my-first-paper"
mode: "docs-first"
research:
topic: "Your research topic here"
llm:
provider: "openai-compatible"
base_url: "https://api.openai.com/v1"
api_key_env: "OPENAI_API_KEY"
experiment:
backend: "docker" # or "subprocess" for local
timeout_sec: 1800
4 Set Your API Key
export OPENAI_API_KEY="sk-your-key-here"
5 Build the Docker Image (Optional)
If using the Docker sandbox backend for GPU-accelerated experiments:
docker build -t researchclaw-sandbox -f researchclaw/docker/Dockerfile .
6 Run Your First Paper
python -m researchclaw run --config config.yaml
The pipeline will execute all 23 stages autonomously. Output will be saved
to the output/ directory including the paper PDF, LaTeX source,
experiment code, and charts.
7 Review Your Paper
After the pipeline completes, find your generated paper at:
output/<run-id>/
paper.pdf # Final PDF
paper.tex # LaTeX source
references.bib # Bibliography
code/main.py # Experiment code
charts/ # Generated figures
results.json # Experiment metrics
Tips
- Use GPT-4.1 or newer for best paper quality
- Set
timeout_sec: 3600 for complex experiments
- For Azure OpenAI, set
provider: "azure_openai" and configure your endpoint
- The pipeline caches literature results, so re-runs with the same topic are faster
- Run
python -m pytest tests/ -v to verify your installation
================================================
FILE: website/index.html
================================================
AutoResearchClaw — Autonomous Research Paper Generation
Chat an Idea.
Get a Paper.
AutoResearchClaw is a fully autonomous 23-stage pipeline that transforms a research topic
into a conference-ready paper — with real experiments, GPU-accelerated code,
and verified citations.
# one command, one paper
python -m researchclaw run --topic "your research idea"
23
Autonomous Stages
1117
Tests Passing
3
Literature APIs
GPU
Docker Sandbox
From Idea to Paper in 23 Steps
Eight autonomous phases transform a research topic into a publication-ready manuscript.
A: Research Scoping
Topic initialization, problem decomposition, and scope definition.
B: Literature Discovery
Multi-source paper search via OpenAlex, Semantic Scholar, and arXiv with quality screening.
C: Knowledge Synthesis
Gap analysis, trend synthesis, and novel hypothesis generation.
D: Experiment Design
Methodology design, code generation, and resource planning with hardware awareness.
E: Experiment Execution
GPU-accelerated Docker sandbox execution with iterative refinement.
F: Analysis & Decision
Result analysis with pivot/refine/proceed decisions.
G: Paper Writing
Structured drafting, multi-agent peer review, and iterative revision.
H: Finalization
Quality gate, knowledge archival, LaTeX export, and citation verification.
Key Features
Built for serious research, engineered for reliability.
Real Literature Search
Multi-source search across OpenAlex, Semantic Scholar, and arXiv with circuit breakers, rate limiting, and intelligent caching.
Docker Sandbox + GPU
Experiments run in isolated Docker containers with NVIDIA GPU passthrough, network sandboxing, and automatic dependency management.
Multi-Agent Peer Review
Simulated conference-style peer review with multiple reviewer personas providing structured feedback for revision.
Iterative Refinement
Automatic pivot/refine/proceed decisions with rollback to any previous stage based on experiment outcomes.
Conference-Ready LaTeX
Publication-quality LaTeX output with proper citations, experiment charts, and structured abstracts.
Citation Verification
All citations verified against CrossRef, OpenAlex, and arXiv APIs to ensure bibliography accuracy.
Showcase Papers
Papers generated entirely by the pipeline, from topic to camera-ready PDF.
📄
Curriculum Learning with Adaptive Difficulty Scheduling for Image Classification
Investigates adaptive curriculum strategies on CIFAR-10/100 benchmarks, demonstrating improved convergence speed and final accuracy compared to standard training.
📄
Test-Time Adaptation via Batch Normalization Statistics for Distribution Shift
Explores test-time adaptation methods using batch normalization statistics to handle distribution shift on CIFAR-10-C corruption benchmarks.
📄
Entropy-Guided Exploration Bonuses for Sparse-Reward Continuous Control
Proposes entropy-guided intrinsic reward bonuses to improve exploration in sparse-reward MuJoCo locomotion environments.
System Architecture
End-to-end pipeline architecture from topic input to published paper.
Ready to Generate Your First Paper?
Clone the repo, configure your LLM API key, and run your first autonomous research paper.
================================================
FILE: website/papers.html
================================================
Showcase Papers — AutoResearchClaw
Showcase Papers
Every paper below was generated entirely by AutoResearchClaw — from a single topic prompt to a complete research manuscript with real experiments.
📊
Curriculum Learning with Adaptive Difficulty Scheduling for Image Classification
Investigates adaptive curriculum learning strategies on CIFAR-10/100 benchmarks.
Proposes a difficulty-aware scheduling mechanism that dynamically adjusts training
sample ordering to improve convergence speed and final accuracy.
💬
Prompt-Length-Aware Routing for Mixture-of-LoRA Experts in Instruction-Following
Proposes a routing mechanism for Mixture-of-LoRA experts that considers prompt
length characteristics. Fine-tunes Qwen-2.5-3B with QLoRA to demonstrate
improved instruction-following across varying input lengths.
🧬
Graph Attention Networks with Learnable Edge Features for Molecular Property Prediction
Extends graph attention networks with learnable edge feature transformations
for molecular property prediction on the OGB-MolHIV benchmark, achieving
competitive performance with existing specialized architectures.
🎮
Entropy-Guided Exploration Bonuses for Sparse-Reward Continuous Control
Proposes entropy-guided intrinsic reward bonuses to improve exploration
efficiency in sparse-reward MuJoCo locomotion environments. Demonstrates
improved sample efficiency over baseline algorithms.
🎨
Spectral Normalization Effects on Mode Collapse in Conditional GANs for CIFAR-10
Systematically studies the effect of spectral normalization on mode collapse
in conditional GANs trained on CIFAR-10, providing both visual and quantitative
analysis (FID, IS) of generation diversity.
🔄
Test-Time Adaptation via Batch Normalization Statistics for Distribution Shift
Explores lightweight test-time adaptation methods that update batch normalization
statistics to handle distribution shift on CIFAR-10-C corruption benchmarks,
demonstrating practical robustness improvements.
Papers Coming Soon
We're generating showcase papers across diverse ML subfields. Each paper will include a downloadable PDF, LaTeX source, experiment code, and quality assessment. Check back soon!
================================================
FILE: website/pipeline.html
================================================
Pipeline — AutoResearchClaw
The 23-Stage Pipeline
Click any stage to expand its description. Yellow badges mark gate stages that require quality checks before proceeding.
A
Research Scoping
1
Topic Initialization
Define research topic, scope, and target conference
Takes a user-provided topic prompt and generates a structured research plan including target conference, research questions, and expected contributions. Emphasizes novelty and alignment with recent conference trends.
LLM
2
Problem Decomposition
Break research into sub-problems and objectives
Decomposes the research topic into concrete sub-problems, defines evaluation criteria, and identifies the key technical challenges to address.
LLM
B
Literature Discovery
3
Search Strategy
Generate search queries and select paper sources
Generates targeted search queries from the research plan, selects which APIs to query (OpenAlex, Semantic Scholar, arXiv), and defines inclusion/exclusion criteria.
LLM
4
Literature Collect
Search OpenAlex, Semantic Scholar, and arXiv
Executes multi-source literature search with intelligent caching, circuit breakers, and rate limiting. Deduplicates results across sources and injects seminal papers from the seed library.
API
5
Literature Screen
Quality and relevance screening (Gate)
Gate Stage. LLM reviews each collected paper for relevance, quality, and domain match. Cross-domain false positives are explicitly rejected. Papers below threshold are filtered out.
Gate
LLM
6
Knowledge Extract
Extract key insights and methodologies from papers
Extracts structured knowledge from screened papers: key contributions, methods, results, limitations, and open questions. Builds a knowledge graph for synthesis.
LLM
C
Knowledge Synthesis
7
Synthesis
Gap analysis and research trend synthesis
Clusters extracted knowledge by topic, identifies research gaps, and synthesizes trends. Produces a structured literature review summary that informs hypothesis generation.
LLM
8
Hypothesis Generation
Generate testable research hypotheses
Generates novel, testable hypotheses that address gaps not covered by existing literature. Each hypothesis includes expected outcomes, evaluation metrics, and ablation dimensions.
LLM
D
Experiment Design
9
Experiment Design
Methodology design and validation (Gate)
Gate Stage. Designs the complete experimental methodology: baselines, ablations, metrics, datasets, and statistical tests. Requires modern benchmarks and real datasets (CIFAR-10, etc.).
Gate
LLM
10
Code Generation
Generate executable experiment code
Generates complete Python experiment code (main.py) with dataset loading, model definition, training loop, evaluation, and results output. Includes security validation, import checking, and code review.
LLM
New
11
Resource Planning
Estimate compute budget and time allocation
Estimates GPU memory requirements, training time, and compute budget. Configures Docker sandbox resource limits and timeout values based on available hardware.
LLM
E
Experiment Execution
12
Experiment Run
Execute experiments in Docker sandbox
Runs generated code inside an isolated Docker container with NVIDIA GPU passthrough. Captures stdout metrics, timing data, and exit codes. Pre-cached datasets available at /workspace/data.
Docker
13
Iterative Refinement
Fix errors and improve experiment code
If experiment fails or produces poor results, automatically diagnoses issues and generates refined code. Checks ablation effectiveness (>5% difference from baseline). Up to 3 refinement iterations.
LLM
Docker
New
F
Analysis & Decision
14
Result Analysis
Statistical analysis of experiment outcomes
Parses experiment outputs, computes statistical significance, generates comparison charts, and produces structured results summaries. Detects result contradictions and null findings.
LLM
15
Research Decision
Pivot, refine, or proceed based on results
Evaluates experiment results and decides: Proceed (results support hypothesis), Refine (re-run with improvements), or Pivot (discard hypothesis, generate new one). Max 2 pivots to prevent infinite loops.
LLM
G
Paper Writing
16
Paper Outline
Structure paper sections and arguments
Creates a detailed paper outline with section-by-section arguments, key claims, and figure placements. Follows conference template structure (abstract, intro, related work, method, experiments, conclusion).
LLM
17
Paper Draft
Write the full paper draft
Generates the complete paper in Markdown/LaTeX with structured writing rules: 150-200 word abstract, no number repetition across sections, proper citation of original papers for all discussed techniques.
LLM
18
Peer Review
Multi-agent simulated conference review
Multiple LLM reviewer personas evaluate the paper: one technical reviewer, one methodology expert, and one clarity/presentation reviewer. Each provides structured feedback with scores.
LLM
19
Paper Revision
Revise based on peer review feedback
Addresses reviewer comments systematically: fixes technical issues, improves writing clarity, adds missing comparisons, and strengthens the narrative. Produces a revised draft.
LLM
New
H
Finalization
20
Quality Gate
Final quality assessment (Gate)
Gate Stage. Comprehensive quality assessment scoring the paper on novelty, soundness, significance, clarity, and reproducibility. Papers below threshold are sent back for rewriting.
Gate
LLM
21
Knowledge Archive
Archive findings to knowledge base
Stores research findings, methodology, and results in the persistent knowledge base for future reference and cross-project knowledge transfer. Non-critical: failure doesn't abort pipeline.
LLM
22
Export & Publish
Generate LaTeX PDF and final output
Converts the paper to conference-ready LaTeX, compiles to PDF, generates BibTeX bibliography, and produces the final output package (paper.pdf, main.tex, references.bib, charts/).
LaTeX
23
Citation Verify
Verify all citations against real databases
Verifies every cited paper exists in real academic databases. Checks DOI via CrossRef, title via OpenAlex, arXiv ID via arXiv API, and falls back to Semantic Scholar. Non-critical: failure doesn't abort pipeline.
API
================================================
FILE: website/style.css
================================================
/* ============================================================
AutoResearchClaw — Showcase Website Styles
Pure CSS, no build step. Tailwind-inspired utility patterns.
============================================================ */
/* ---------- Reset & Variables ---------- */
/* Design tokens: dark slate theme, sky-blue primary, violet accent. */
:root {
  --color-bg: #0f172a;
  --color-bg-alt: #1e293b;
  --color-surface: #334155;
  --color-border: #475569;
  --color-text: #e2e8f0;
  --color-text-muted:#94a3b8;
  --color-primary: #38bdf8;
  --color-primary-d: #0284c7; /* darker primary, used for hover states */
  --color-accent: #a78bfa;
  --color-accent-d: #7c3aed;
  --color-success: #4ade80;
  --color-warning: #fbbf24; /* also marks "gate" pipeline stages */
  --color-danger: #f87171;
  --color-white: #f8fafc;
  --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
  --font-mono: 'JetBrains Mono', 'Fira Code', 'Consolas', monospace;
  --radius: 0.75rem;
  --radius-lg: 1rem;
  --shadow: 0 4px 6px -1px rgba(0,0,0,.3), 0 2px 4px -2px rgba(0,0,0,.2);
  --shadow-lg: 0 10px 15px -3px rgba(0,0,0,.4), 0 4px 6px -4px rgba(0,0,0,.3);
  --transition: 0.2s ease;
}
/* Minimal reset: border-box sizing, zeroed margins/padding everywhere. */
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html { scroll-behavior: smooth; font-size: 16px; }
body {
  font-family: var(--font-sans);
  background: var(--color-bg);
  color: var(--color-text);
  line-height: 1.7;
  min-height: 100vh;
}
/* Links swap from primary to accent color on hover. */
a { color: var(--color-primary); text-decoration: none; transition: color var(--transition); }
a:hover { color: var(--color-accent); }
/* Fluid images by default. */
img { max-width: 100%; height: auto; display: block; }
/* ---------- Layout ---------- */
.container { max-width: 1200px; margin: 0 auto; padding: 0 1.5rem; }
.section { padding: 5rem 0; }
/* ---------- Navigation ---------- */
/* Fixed, translucent, backdrop-blurred top bar. */
.navbar {
  position: fixed; top: 0; left: 0; right: 0; z-index: 100;
  background: rgba(15, 23, 42, 0.85);
  backdrop-filter: blur(12px);
  border-bottom: 1px solid var(--color-border);
  padding: 0.75rem 0;
}
.navbar .container {
  display: flex; align-items: center; justify-content: space-between;
}
.nav-brand {
  display: flex; align-items: center; gap: 0.75rem;
  font-weight: 700; font-size: 1.15rem; color: var(--color-white);
}
.nav-brand img { height: 32px; width: 32px; border-radius: 6px; }
.nav-links { display: flex; gap: 1.5rem; list-style: none; }
.nav-links a {
  color: var(--color-text-muted); font-size: 0.9rem; font-weight: 500;
  padding: 0.4rem 0; transition: color var(--transition);
}
.nav-links a:hover, .nav-links a.active { color: var(--color-primary); }
/* Pill-shaped GitHub call-to-action button in the navbar. */
.nav-github {
  display: inline-flex; align-items: center; gap: 0.4rem;
  background: var(--color-surface); color: var(--color-white);
  padding: 0.45rem 1rem; border-radius: 9999px; font-size: 0.85rem;
  font-weight: 600; transition: background var(--transition);
}
.nav-github:hover { background: var(--color-primary-d); color: var(--color-white); }
/* Mobile nav toggle */
/* Hamburger is hidden on desktop; at <=768px the links collapse into a
   dropdown that JS opens by toggling the .open class. */
.nav-toggle { display: none; background: none; border: none; color: var(--color-text); font-size: 1.5rem; cursor: pointer; }
@media (max-width: 768px) {
  .nav-toggle { display: block; }
  .nav-links {
    display: none; flex-direction: column; position: absolute;
    top: 100%; left: 0; right: 0; background: var(--color-bg-alt);
    padding: 1rem 1.5rem; gap: 0.5rem; border-bottom: 1px solid var(--color-border);
  }
  .nav-links.open { display: flex; }
}
/* ---------- Hero ---------- */
/* Extra top padding clears the fixed navbar; subtle blue glow fades down. */
.hero {
  padding: 10rem 0 5rem;
  text-align: center;
  background: linear-gradient(180deg, rgba(56,189,248,0.08) 0%, transparent 60%);
}
.hero h1 {
  font-size: clamp(2rem, 5vw, 3.5rem);
  font-weight: 800;
  line-height: 1.15;
  margin-bottom: 1rem;
}
/* Gradient-filled heading text via background-clip. */
.hero h1 .gradient {
  background: linear-gradient(135deg, var(--color-primary), var(--color-accent));
  -webkit-background-clip: text; background-clip: text;
  -webkit-text-fill-color: transparent;
}
.hero .tagline {
  font-size: clamp(1.05rem, 2vw, 1.35rem);
  color: var(--color-text-muted);
  max-width: 640px; margin: 0 auto 2rem;
}
.hero-actions { display: flex; gap: 1rem; justify-content: center; flex-wrap: wrap; }
/* Shared pill button base; variants below set the color scheme. */
.btn {
  display: inline-flex; align-items: center; gap: 0.5rem;
  padding: 0.75rem 1.75rem; border-radius: 9999px;
  font-weight: 600; font-size: 0.95rem; cursor: pointer;
  border: none; transition: all var(--transition);
}
.btn-primary {
  background: linear-gradient(135deg, var(--color-primary), var(--color-accent));
  color: #0f172a;
}
.btn-primary:hover { transform: translateY(-2px); box-shadow: var(--shadow-lg); color: #0f172a; }
.btn-outline {
  background: transparent; color: var(--color-primary);
  border: 2px solid var(--color-primary);
}
.btn-outline:hover { background: rgba(56,189,248,0.1); color: var(--color-primary); }
/* Inline terminal-style snippet under the hero copy. */
.hero-code {
  margin-top: 2.5rem;
  display: inline-block;
  background: var(--color-bg-alt);
  border: 1px solid var(--color-border);
  border-radius: var(--radius);
  padding: 0.8rem 1.5rem;
  font-family: var(--font-mono);
  font-size: 0.9rem;
  color: var(--color-success);
}
/* ---------- Stats ---------- */
/* Auto-fitting grid of headline metric tiles. */
.stats {
  display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
  gap: 1.5rem; padding: 3rem 0;
}
.stat {
  text-align: center; padding: 1.5rem;
  background: var(--color-bg-alt); border-radius: var(--radius);
  border: 1px solid var(--color-border);
}
/* Big gradient number inside each tile. */
.stat-value {
  font-size: 2rem; font-weight: 800;
  background: linear-gradient(135deg, var(--color-primary), var(--color-accent));
  -webkit-background-clip: text; background-clip: text;
  -webkit-text-fill-color: transparent;
}
.stat-label { font-size: 0.85rem; color: var(--color-text-muted); margin-top: 0.25rem; }
/* ---------- Section headings ---------- */
.section-header {
  text-align: center; margin-bottom: 3rem;
}
.section-header h2 {
  font-size: clamp(1.5rem, 3vw, 2.25rem);
  font-weight: 700; margin-bottom: 0.5rem;
}
.section-header p {
  color: var(--color-text-muted); max-width: 600px; margin: 0 auto;
}
/* ---------- Pipeline Overview (Landing) ---------- */
.pipeline-preview {
display: grid; grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
gap: 1rem;
}
.phase-card {
background: var(--color-bg-alt); border-radius: var(--radius);
border: 1px solid var(--color-border);
padding: 1.5rem; transition: all var(--transition);
}
.phase-card:hover {
border-color: var(--color-primary);
transform: translateY(-3px); box-shadow: var(--shadow-lg);
}
.phase-card .phase-icon { font-size: 1.75rem; margin-bottom: 0.75rem; }
.phase-card h3 { font-size: 1rem; font-weight: 600; margin-bottom: 0.4rem; }
.phase-card p { font-size: 0.85rem; color: var(--color-text-muted); line-height: 1.5; }
.phase-card .phase-stages {
margin-top: 0.75rem; display: flex; gap: 0.35rem; flex-wrap: wrap;
}
.stage-dot {
width: 8px; height: 8px; border-radius: 50%;
background: var(--color-primary); opacity: 0.5;
}
.stage-dot.gate { background: var(--color-warning); opacity: 1; }
/* ---------- Paper Cards ---------- */
.paper-grid {
display: grid; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
gap: 1.5rem;
}
.paper-card {
background: var(--color-bg-alt); border-radius: var(--radius-lg);
border: 1px solid var(--color-border);
overflow: hidden; transition: all var(--transition);
}
.paper-card:hover {
border-color: var(--color-primary);
transform: translateY(-3px); box-shadow: var(--shadow-lg);
}
.paper-thumb {
height: 180px; background: linear-gradient(135deg, var(--color-surface), var(--color-bg));
display: flex; align-items: center; justify-content: center;
font-size: 3rem; color: var(--color-text-muted);
}
.paper-body { padding: 1.25rem; }
.paper-body h3 { font-size: 1rem; font-weight: 600; line-height: 1.4; margin-bottom: 0.5rem; }
.paper-body .paper-meta {
display: flex; gap: 0.5rem; flex-wrap: wrap; margin-bottom: 0.75rem;
}
.badge {
display: inline-flex; align-items: center; gap: 0.25rem;
padding: 0.2rem 0.6rem; border-radius: 9999px;
font-size: 0.72rem; font-weight: 600;
}
.badge-domain { background: rgba(56,189,248,0.15); color: var(--color-primary); }
.badge-score { background: rgba(74,222,128,0.15); color: var(--color-success); }
.badge-pending { background: rgba(251,191,36,0.15); color: var(--color-warning); }
.paper-body .paper-abstract {
font-size: 0.82rem; color: var(--color-text-muted);
line-height: 1.55; display: -webkit-box;
-webkit-line-clamp: 3; -webkit-box-orient: vertical; overflow: hidden;
}
/* ---------- Feature Cards ---------- */
.feature-grid {
display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
gap: 1.5rem;
}
.feature-card {
background: var(--color-bg-alt); border-radius: var(--radius);
border: 1px solid var(--color-border);
padding: 1.75rem; transition: all var(--transition);
}
.feature-card:hover {
border-color: var(--color-accent);
transform: translateY(-2px);
}
.feature-card .feature-icon {
width: 48px; height: 48px; border-radius: 12px;
display: flex; align-items: center; justify-content: center;
font-size: 1.4rem; margin-bottom: 1rem;
}
.feature-card .feature-icon.bg-blue { background: rgba(56,189,248,0.15); }
.feature-card .feature-icon.bg-purple { background: rgba(167,139,250,0.15); }
.feature-card .feature-icon.bg-green { background: rgba(74,222,128,0.15); }
.feature-card .feature-icon.bg-amber { background: rgba(251,191,36,0.15); }
.feature-card .feature-icon.bg-red { background: rgba(248,113,113,0.15); }
.feature-card h3 { font-size: 1.05rem; font-weight: 600; margin-bottom: 0.4rem; }
.feature-card p { font-size: 0.85rem; color: var(--color-text-muted); line-height: 1.5; }
/* ---------- Pipeline Page (full) ---------- */
/* Centered column holding the complete pipeline listing. */
.pipeline-full {
  max-width: 860px;
  margin: 0 auto;
}
.phase-group {
  margin-bottom: 3rem;
}
/* Header row for each phase: letter chip + title, underlined. */
.phase-group-header {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  margin-bottom: 1rem;
  padding-bottom: 0.5rem;
  border-bottom: 2px solid var(--color-border);
}
/* Gradient square chip displaying the phase letter. */
.phase-group-header .phase-letter {
  width: 36px;
  height: 36px;
  border-radius: 10px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-weight: 800;
  font-size: 0.85rem;
  background: linear-gradient(135deg, var(--color-primary), var(--color-accent));
  color: #0f172a;
}
.phase-group-header h3 {
  font-size: 1.15rem;
  font-weight: 600;
}
/* Vertical stack of stage cards. */
.stage-list {
  display: flex;
  flex-direction: column;
  gap: 0.5rem;
}
/* One clickable stage card. */
.stage-item {
  display: flex;
  align-items: flex-start;
  gap: 1rem;
  padding: 1rem 1.25rem;
  background: var(--color-bg-alt);
  border-radius: var(--radius);
  border: 1px solid var(--color-border);
  cursor: pointer;
  transition: all var(--transition);
}
.stage-item:hover {
  border-color: var(--color-primary);
}
/* Expanded state: primary border plus a faint blue tint. */
.stage-item.expanded {
  border-color: var(--color-primary);
  background: rgba(56,189,248,0.04);
}
/* Numbered chip at the left edge of each stage card. */
.stage-number {
  flex-shrink: 0;
  width: 32px;
  height: 32px;
  border-radius: 8px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-weight: 700;
  font-size: 0.8rem;
  background: var(--color-surface);
  color: var(--color-text);
}
/* Gate stages get an amber chip instead of the neutral surface. */
.stage-number.gate {
  background: rgba(251,191,36,0.2);
  color: var(--color-warning);
}
.stage-info {
  flex: 1;
}
.stage-info h4 {
  font-size: 0.95rem;
  font-weight: 600;
  margin-bottom: 0.15rem;
}
.stage-info .stage-subtitle {
  font-size: 0.8rem;
  color: var(--color-text-muted);
}
/* Detail panel, hidden until the parent card carries .expanded. */
.stage-detail {
  display: none;
  margin-top: 0.75rem;
  padding-top: 0.75rem;
  border-top: 1px solid var(--color-border);
  font-size: 0.85rem;
  color: var(--color-text-muted);
  line-height: 1.6;
}
.stage-item.expanded .stage-detail {
  display: block;
}
/* Row of pill badges inside a stage. */
.stage-badges {
  display: flex;
  gap: 0.35rem;
  flex-wrap: wrap;
  margin-top: 0.5rem;
}
.stage-badge {
  padding: 0.15rem 0.5rem;
  border-radius: 9999px;
  font-size: 0.7rem;
  font-weight: 600;
}
/* Badge color variants: translucent fill with matching accent text. */
.stage-badge-gate { background: rgba(251,191,36,0.15); color: var(--color-warning); }
.stage-badge-new { background: rgba(167,139,250,0.15); color: var(--color-accent); }
.stage-badge-llm { background: rgba(56,189,248,0.15); color: var(--color-primary); }
.stage-badge-docker { background: rgba(74,222,128,0.15); color: var(--color-success); }
/* ---------- Footer ---------- */
/* Centered footer separated from the page body by a top border. */
.footer {
  padding: 3rem 0;
  text-align: center;
  border-top: 1px solid var(--color-border);
}
.footer p {
  color: var(--color-text-muted);
  font-size: 0.85rem;
}
/* Horizontal, centered link list above the footer text. */
.footer-links {
  display: flex;
  justify-content: center;
  gap: 1.5rem;
  list-style: none;
  margin-bottom: 1rem;
}
.footer-links a {
  color: var(--color-text-muted);
  font-size: 0.85rem;
}
.footer-links a:hover {
  color: var(--color-primary);
}
/* ---------- Getting Started ---------- */
/* Narrow, centered column for the setup walkthrough. */
.getting-started-content {
  max-width: 780px;
  margin: 0 auto;
}
/* Card for one numbered setup step. */
.step-block {
  background: var(--color-bg-alt);
  border-radius: var(--radius);
  border: 1px solid var(--color-border);
  padding: 1.5rem;
  margin-bottom: 1rem;
}
.step-block h3 {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  font-size: 1.05rem;
  font-weight: 600;
  margin-bottom: 0.75rem;
}
/* Circular number chip preceding the step title. */
.step-num {
  width: 28px;
  height: 28px;
  border-radius: 50%;
  display: inline-flex;
  align-items: center;
  justify-content: center;
  font-size: 0.8rem;
  font-weight: 700;
  background: var(--color-primary);
  color: #0f172a;
}
/* Terminal-styled snippet: dark GitHub-like background, green text,
   horizontal scroll for long commands. */
.code-block {
  background: #0d1117;
  border-radius: 8px;
  padding: 1rem 1.25rem;
  margin-top: 0.5rem;
  font-family: var(--font-mono);
  font-size: 0.85rem;
  color: var(--color-success);
  overflow-x: auto;
  border: 1px solid #21262d;
}
.code-block .comment {
  color: var(--color-text-muted);
}
/* ---------- Connector Line (pipeline page) ---------- */
/* Wrapper that centers the short vertical line between pipeline items. */
.connector {
  display: flex;
  justify-content: center;
  padding: 0.5rem 0;
}
/* Thin faded gradient stroke running top to bottom. */
.connector-line {
  width: 2px;
  height: 24px;
  background: linear-gradient(180deg, var(--color-primary), var(--color-accent));
  opacity: 0.4;
}
/* ---------- Coming Soon Overlay ---------- */
/* Centered placeholder shown for not-yet-built pages. */
.coming-soon {
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  padding: 4rem 2rem;
  text-align: center;
}
.coming-soon .cs-icon {
  font-size: 3rem;
  margin-bottom: 1rem;
}
.coming-soon h3 {
  font-size: 1.25rem;
  margin-bottom: 0.5rem;
}
.coming-soon p {
  color: var(--color-text-muted);
  max-width: 400px;
}
/* ---------- Utilities ---------- */
/* Single-purpose helper classes (Tailwind-style naming: mt-* = margin-top,
   mb-* = margin-bottom, numeric suffix maps N -> N * 0.25rem). */
.text-center { text-align: center; }
.mt-2 { margin-top: 0.5rem; }
.mt-4 { margin-top: 1rem; }
.mt-8 { margin-top: 2rem; }
/* FIX: .mb-4 previously declared `margin-top: 1rem`, making it a duplicate of
   .mt-4; a margin-bottom utility must set margin-bottom. */
.mb-4 { margin-bottom: 1rem; }
.gap-2 { gap: 0.5rem; }
.flex { display: flex; }
.items-center { align-items: center; }
.justify-center { justify-content: center; }